path: root/libswscale/x86/scale.asm
author    Andreas Rheinhardt <andreas.rheinhardt@outlook.com>  2022-06-09 16:57:34 +0200
committer Andreas Rheinhardt <andreas.rheinhardt@outlook.com>  2022-06-22 13:36:04 +0200
commit    a05f22eaf393177b94432431c145cbc5ba10390a (patch)
tree      31a2ae01b520b2578477ea4b6b2febf1d0e6c1e6 /libswscale/x86/scale.asm
parent    2831837182fe26f0a19a4d366f3f0553311f1291 (diff)
swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2, which means that some functions for MMX, MMXEXT, SSE and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2). So, given that the only systems that benefit from these functions are truly ancient 32-bit x86s, they are removed.

Moreover, some of the removed code was buggy/not bitexact and led to failures involving the f32le and f32be versions of gray, gbrp and gbrap on x86-32 when SSE2 was not disabled. See e.g. https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx

Notice that yuv2yuvX_mmx is not removed, because it is used by SSE3 and AVX2 as a fallback in case of unaligned data and also for tail processing. I don't know why yuv2yuvX_mmxext isn't being used for this; an earlier version [1] of 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used it, but the version that was eventually applied does not.

[1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
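The "overridden" behaviour the message relies on can be pictured with a small, purely illustrative C sketch. The cpu-flag constants, the hscale_8to15_* names and init_hscale below are hypothetical stand-ins, not the actual libswscale symbols (the real init code goes through av_get_cpu_flags() and EXTERNAL_*()-style checks): each more capable version simply overwrites the pointer set by the previous one, so on x86-64, where SSE2 is always present, the MMX assignment can never survive initialization.

#include <stdint.h>

/* Illustrative stand-ins for the real CPU-flag query and the hscale
 * implementations; names and flag values are hypothetical. */
#define CPU_FLAG_MMX  (1 << 0)
#define CPU_FLAG_SSE2 (1 << 1)

typedef void (*hscale_fn)(int16_t *dst, const uint8_t *src, int w);

static void hscale_8to15_mmx (int16_t *dst, const uint8_t *src, int w) { /* ... */ }
static void hscale_8to15_sse2(int16_t *dst, const uint8_t *src, int w) { /* ... */ }

static hscale_fn init_hscale(int cpu_flags)
{
    hscale_fn hscale = 0;               /* C fallback would go here */
    /* Each check overwrites the previous choice, so the last matching
     * (i.e. most capable) version wins. */
    if (cpu_flags & CPU_FLAG_MMX)
        hscale = hscale_8to15_mmx;
    if (cpu_flags & CPU_FLAG_SSE2)
        hscale = hscale_8to15_sse2;     /* always taken on x86-64 */
    return hscale;
}

With that dispatch shape, removing the MMX/MMXEXT/3dnow bodies only changes behaviour on CPUs that lack SSE2, i.e. the ancient 32-bit x86s the message mentions.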
Diffstat (limited to 'libswscale/x86/scale.asm')
-rw-r--r--  libswscale/x86/scale.asm | 45
1 file changed, 8 insertions, 37 deletions
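For orientation before the diff: the context lines kept below reduce four 32-bit partial sums per register to pairwise horizontal sums in two ways, an SSE2 shufps/shufps/paddd sequence and a single SSSE3 phaddd. The following hedged C-intrinsics sketch shows the same idiom (the hadd32_* names are illustrative, not functions from the patch):

#include <emmintrin.h>   /* SSE2: casts, _mm_shuffle_ps, _mm_add_epi32 */
#include <tmmintrin.h>   /* SSSE3: _mm_hadd_epi32 */

/* SSE2: emulate a horizontal add of dwords with two shufps and one paddd,
 * as in the retained "shufps m0, m1, ..." / "paddd m0, m4" path. */
static __m128i hadd32_sse2(__m128i a, __m128i b)
{
    __m128 af = _mm_castsi128_ps(a), bf = _mm_castsi128_ps(b);
    __m128i even = _mm_castps_si128(_mm_shuffle_ps(af, bf, _MM_SHUFFLE(2, 0, 2, 0))); /* a0 a2 b0 b2 */
    __m128i odd  = _mm_castps_si128(_mm_shuffle_ps(af, bf, _MM_SHUFFLE(3, 1, 3, 1))); /* a1 a3 b1 b3 */
    return _mm_add_epi32(even, odd); /* a0+a1, a2+a3, b0+b1, b2+b3 */
}

/* SSSE3: one horizontal add (phaddd) produces the same result. */
static __m128i hadd32_ssse3(__m128i a, __m128i b)
{
    return _mm_hadd_epi32(a, b);
}

The deleted MMX paths performed the equivalent reduction on 64-bit registers with punpckldq/punpckhdq followed by paddd; those are exactly the blocks removed below.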
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 83cabff722..c62ae3dcc2 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%define mov32 mov
%endif ; x86-64
%if %2 == 19
-%if mmsize == 8 ; mmx
- mova m2, [max_19bit_int]
-%elif cpuflag(sse4)
+%if cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
@@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
-%if mmsize == 8 ; mmx
- movq m4, m0
- punpckldq m0, m1
- punpckhdq m4, m1
- paddd m0, m4
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
@@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
@@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
-%if mmsize == 8
- paddd m0, m1
- paddd m4, m5
- movq m1, m0
- punpckldq m0, m4
- punpckhdq m1, m4
- paddd m0, m1
-%elif notcpuflag(ssse3) ; sse2
+%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
@@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
-%endif ; mmx/sse2/ssse3/sse4
+%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
@@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
mov srcq, srcmemmp
.innerloop:
- ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
+ ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
@@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
lea filterq, [filterq+(fltsizeq+dlt)*2]
-%if mmsize == 8 ; mmx
- movq m0, m4
- punpckldq m4, m5
- punpckhdq m0, m5
- paddd m0, m4
-%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
@@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
-%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
@@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
- add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
+ add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
@@ -385,12 +364,8 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
-%if mmsize == 8
-SCALE_FUNC %1, %2, X, X, 7, %3
-%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
-%endif
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
@@ -411,10 +386,6 @@ SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
-%if ARCH_X86_32
-INIT_MMX mmx
-SCALE_FUNCS2 0, 0, 0
-%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3