path: root/libswscale/x86/scale.asm
author    Andreas Rheinhardt <andreas.rheinhardt@outlook.com>  2022-06-09 16:57:34 +0200
committer Andreas Rheinhardt <andreas.rheinhardt@outlook.com>  2022-06-22 13:36:04 +0200
commit    a05f22eaf393177b94432431c145cbc5ba10390a (patch)
tree      31a2ae01b520b2578477ea4b6b2febf1d0e6c1e6 /libswscale/x86/scale.asm
parent    2831837182fe26f0a19a4d366f3f0553311f1291 (diff)
swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2, which means that some functions for MMX, MMXEXT, SSE and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2). So, given that the only systems that benefit from these functions are truly ancient 32-bit x86s, they are removed.

Moreover, some of the removed code was buggy/not bitexact and led to failures involving the f32le and f32be versions of gray, gbrp and gbrap on x86-32 when SSE2 was not disabled. See e.g. https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx

Notice that yuv2yuvX_mmx is not removed, because it is used by SSE3 and AVX2 as a fallback in case of unaligned data and also for tail processing. I don't know why yuv2yuvX_mmxext isn't being used for this; an earlier version [1] of 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used it, but the version that was eventually applied does not.

[1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
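The "overridden" behaviour the message relies on can be pictured with a small, purely illustrative C sketch. The cpu-flag constants, the hscale_8to15_* names and init_hscale below are hypothetical stand-ins, not the actual libswscale symbols (the real init code goes through av_get_cpu_flags() and EXTERNAL_*()-style checks): each more capable version simply overwrites the pointer set by the previous one, so on x86-64, where SSE2 is always present, the MMX assignment can never survive initialization.

#include <stdint.h>

/* Illustrative stand-ins for the real CPU-flag query and the hscale
 * implementations; names and flag values are hypothetical. */
#define CPU_FLAG_MMX  (1 << 0)
#define CPU_FLAG_SSE2 (1 << 1)

typedef void (*hscale_fn)(int16_t *dst, const uint8_t *src, int w);

static void hscale_8to15_mmx (int16_t *dst, const uint8_t *src, int w) { /* ... */ }
static void hscale_8to15_sse2(int16_t *dst, const uint8_t *src, int w) { /* ... */ }

static hscale_fn init_hscale(int cpu_flags)
{
    hscale_fn hscale = 0;               /* C fallback would go here */
    /* Each check overwrites the previous choice, so the last matching
     * (i.e. most capable) version wins. */
    if (cpu_flags & CPU_FLAG_MMX)
        hscale = hscale_8to15_mmx;
    if (cpu_flags & CPU_FLAG_SSE2)
        hscale = hscale_8to15_sse2;     /* always taken on x86-64 */
    return hscale;
}

With that dispatch shape, removing the MMX/MMXEXT/3dnow bodies only changes behaviour on CPUs that lack SSE2, i.e. the ancient 32-bit x86s the message mentions.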
Diffstat (limited to 'libswscale/x86/scale.asm')
-rw-r--r--  libswscale/x86/scale.asm | 45
1 file changed, 8 insertions, 37 deletions
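For orientation before the diff: the context lines kept below reduce four 32-bit partial sums per register to pairwise horizontal sums in two ways, an SSE2 shufps/shufps/paddd sequence and a single SSSE3 phaddd. The following hedged C-intrinsics sketch shows the same idiom (the hadd32_* names are illustrative, not functions from the patch):

#include <emmintrin.h>   /* SSE2: casts, _mm_shuffle_ps, _mm_add_epi32 */
#include <tmmintrin.h>   /* SSSE3: _mm_hadd_epi32 */

/* SSE2: emulate a horizontal add of dwords with two shufps and one paddd,
 * as in the retained "shufps m0, m1, ..." / "paddd m0, m4" path. */
static __m128i hadd32_sse2(__m128i a, __m128i b)
{
    __m128 af = _mm_castsi128_ps(a), bf = _mm_castsi128_ps(b);
    __m128i even = _mm_castps_si128(_mm_shuffle_ps(af, bf, _MM_SHUFFLE(2, 0, 2, 0))); /* a0 a2 b0 b2 */
    __m128i odd  = _mm_castps_si128(_mm_shuffle_ps(af, bf, _MM_SHUFFLE(3, 1, 3, 1))); /* a1 a3 b1 b3 */
    return _mm_add_epi32(even, odd); /* a0+a1, a2+a3, b0+b1, b2+b3 */
}

/* SSSE3: one horizontal add (phaddd) produces the same result. */
static __m128i hadd32_ssse3(__m128i a, __m128i b)
{
    return _mm_hadd_epi32(a, b);
}

The deleted MMX paths performed the equivalent reduction on 64-bit registers with punpckldq/punpckhdq followed by paddd; those are exactly the blocks removed below.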
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 83cabff722..c62ae3dcc2 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%define mov32 mov
%endif ; x86-64
%if %2 == 19
-%if mmsize == 8 ; mmx
- mova m2, [max_19bit_int]
-%elif cpuflag(sse4)
+%if cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
@@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
- movq m4, m0
- punpckldq m0, m1
- punpckhdq m4, m1
- paddd m0, m4
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
@@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
@@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
- paddd m0, m1
- paddd m4, m5
- movq m1, m0
- punpckldq m0, m4
- punpckhdq m1, m4
- paddd m0, m1
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
@@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
@@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
mov srcq, srcmemmp
.innerloop:
- ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+ ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
@@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
lea filterq, [filterq+(fltsizeq+dlt)*2]
-%if mmsize == 8 ; mmx
- movq m0, m4
- punpckldq m4, m5
- punpckhdq m0, m5
- paddd m0, m4
-%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
@@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
@@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
- add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+ add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
@@ -385,12 +364,8 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
@@ -411,10 +386,6 @@ SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3