From a05f22eaf393177b94432431c145cbc5ba10390a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Thu, 9 Jun 2022 16:57:34 +0200 Subject: swscale/x86/swscale: Remove obsolete and harmful MMX(EXT) functions x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT, SSE and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2). So given that the only systems that benefit from these functions are truely ancient 32bit x86s they are removed. Moreover, some of the removed code was buggy/not bitexact and lead to failures involving the f32le and f32be versions of gray, gbrp and gbrap on x86-32 when SSE2 was not disabled. See e.g. https://fate.ffmpeg.org/report.cgi?time=20220609221253&slot=x86_32-debian-kfreebsd-gcc-4.4-cpuflags-mmx Notice that yuv2yuvX_mmx is not removed, because it is used by SSE3 and AVX2 as fallback in case of unaligned data and also for tail processing. I don't know why yuv2yuvX_mmxext isn't being used for this; an earlier version [1] of 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used it, but the version that was eventually applied does not. [1]: https://ffmpeg.org/pipermail/ffmpeg-devel/2020-November/272124.html Signed-off-by: Andreas Rheinhardt --- libswscale/x86/input.asm | 82 +++++--------------------------------- libswscale/x86/output.asm | 30 ++------------ libswscale/x86/scale.asm | 45 ++++----------------- libswscale/x86/swscale.c | 83 +++++---------------------------------- libswscale/x86/swscale_template.c | 30 ++++---------- 5 files changed, 38 insertions(+), 232 deletions(-) (limited to 'libswscale') diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index fcdfe2fcd8..6de6733faa 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -133,23 +133,18 @@ SECTION .text ; %2 = rgb or bgr %macro RGB24_TO_Y_FN 2-3 cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table -%if mmsize == 8 - mova m5, [%2_Ycoeff_12x4] - mova m6, [%2_Ycoeff_3x56] -%define coeff1 m5 -%define coeff2 m6 -%elif ARCH_X86_64 +%if ARCH_X86_64 mova m8, [%2_Ycoeff_12x4] mova m9, [%2_Ycoeff_3x56] %define coeff1 m8 %define coeff2 m9 -%else ; x86-32 && mmsize == 16 +%else ; x86-32 %define coeff1 [%2_Ycoeff_12x4] %define coeff2 [%2_Ycoeff_3x56] -%endif ; x86-32/64 && mmsize == 8/16 -%if (ARCH_X86_64 || mmsize == 8) && %0 == 3 +%endif ; x86-32/64 +%if ARCH_X86_64 && %0 == 3 jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body -%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 +%else ; ARCH_X86_64 && %0 == 3 .body: %if cpuflag(ssse3) mova m7, [shuf_rgb_12x4] @@ -184,7 +179,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 } movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 } -%if mmsize == 16 ; i.e. 
sse2 punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 } @@ -193,7 +187,6 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 } punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; mmsize == 16 punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } @@ -215,7 +208,7 @@ cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table add wq, mmsize jl .loop REP_RET -%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 +%endif ; ARCH_X86_64 && %0 == 3 %endmacro ; %1 = nr. of XMM registers @@ -275,12 +268,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 } movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 } movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 } -%if mmsize == 16 punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 } punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 } movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 } movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 } -%endif ; mmsize == 16 punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 } punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 } %endif ; cpuflag(ssse3) @@ -294,12 +285,10 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } %else ; !cpuflag(ssse3) -%if mmsize == 16 movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 } movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 } punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 } punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 } -%endif ; mmsize == 16 && !cpuflag(ssse3) punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 } punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 } %endif ; cpuflag(ssse3) @@ -320,13 +309,8 @@ cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table psrad m4, 9 packssdw m0, m1 ; (word) { U[0-7] } packssdw m2, m4 ; (word) { V[0-7] } -%if mmsize == 8 mova [dstUq+wq], m0 mova [dstVq+wq], m2 -%else ; mmsize == 16 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%endif ; mmsize == 8/16 add wq, mmsize jl .loop REP_RET @@ -342,11 +326,6 @@ RGB24_TO_UV_FN %2, rgb RGB24_TO_UV_FN %2, bgr, rgb %endmacro -%if ARCH_X86_32 -INIT_MMX mmx -RGB24_FUNCS 0, 0 -%endif - INIT_XMM sse2 RGB24_FUNCS 10, 12 @@ -483,13 +462,8 @@ cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table psrad m1, 9 packssdw m0, m4 ; (word) { U[0-7] } packssdw m2, m1 ; (word) { V[0-7] } -%if mmsize == 8 mova [dstUq+wq], m0 mova [dstVq+wq], m2 -%else ; mmsize == 16 - mova [dstUq+wq], m0 - mova [dstVq+wq], m2 -%endif ; mmsize == 8/16 add wq, mmsize jl .loop sub wq, mmsize - 1 @@ -535,11 +509,6 @@ RGB32_TO_UV_FN %2, a, r, g, b, rgba RGB32_TO_UV_FN %2, a, b, g, r, rgba %endmacro -%if ARCH_X86_32 -INIT_MMX mmx -RGB32_FUNCS 0, 0 -%endif - INIT_XMM sse2 RGB32_FUNCS 8, 12 @@ -588,25 +557,18 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w movsxd wq, wd %endif add dstq, wq -%if mmsize == 16 test srcq, 15 -%endif lea srcq, [srcq+wq*2] %ifidn %2, yuyv pcmpeqb m2, m2 ; (byte) { 0xff } x 16 psrlw m2, 8 ; (word) { 0x00ff } x 8 %endif ; yuyv -%if mmsize == 16 jnz .loop_u_start 
neg wq LOOP_YUYV_TO_Y a, %2 .loop_u_start: neg wq LOOP_YUYV_TO_Y u, %2 -%else ; mmsize == 8 - neg wq - LOOP_YUYV_TO_Y a, %2 -%endif ; mmsize == 8/16 %endmacro ; %1 = a (aligned) or u (unaligned) @@ -632,16 +594,9 @@ cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 } pand m1, m0, m2 ; (word) { U0, U1, ..., U7 } psrlw m0, 8 ; (word) { V0, V1, ..., V7 } -%if mmsize == 16 packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 } movh [dstUq+wq], m1 movhps [dstVq+wq], m1 -%else ; mmsize == 8 - packuswb m1, m1 ; (byte) { U0, ... U3 } - packuswb m0, m0 ; (byte) { V0, ... V3 } - movh [dstUq+wq], m1 - movh [dstVq+wq], m0 -%endif ; mmsize == 8/16 add wq, mmsize / 2 jl .loop_%1 REP_RET @@ -661,24 +616,24 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %endif add dstUq, wq add dstVq, wq -%if mmsize == 16 && %0 == 2 +%if %0 == 2 test srcq, 15 %endif lea srcq, [srcq+wq*4] pcmpeqb m2, m2 ; (byte) { 0xff } x 16 psrlw m2, 8 ; (word) { 0x00ff } x 8 ; NOTE: if uyvy+avx, u/a are identical -%if mmsize == 16 && %0 == 2 +%if %0 == 2 jnz .loop_u_start neg wq LOOP_YUYV_TO_UV a, %2 .loop_u_start: neg wq LOOP_YUYV_TO_UV u, %2 -%else ; mmsize == 8 +%else neg wq LOOP_YUYV_TO_UV a, %2 -%endif ; mmsize == 8/16 +%endif %endmacro ; %1 = a (aligned) or u (unaligned) @@ -716,35 +671,18 @@ cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %endif add dstUq, wq add dstVq, wq -%if mmsize == 16 test srcq, 15 -%endif lea srcq, [srcq+wq*2] pcmpeqb m5, m5 ; (byte) { 0xff } x 16 psrlw m5, 8 ; (word) { 0x00ff } x 8 -%if mmsize == 16 jnz .loop_u_start neg wq LOOP_NVXX_TO_UV a, %2 .loop_u_start: neg wq LOOP_NVXX_TO_UV u, %2 -%else ; mmsize == 8 - neg wq - LOOP_NVXX_TO_UV a, %2 -%endif ; mmsize == 8/16 %endmacro -%if ARCH_X86_32 -INIT_MMX mmx -YUYV_TO_Y_FN 0, yuyv -YUYV_TO_Y_FN 0, uyvy -YUYV_TO_UV_FN 0, yuyv -YUYV_TO_UV_FN 0, uyvy -NVXX_TO_UV_FN 0, nv12 -NVXX_TO_UV_FN 0, nv21 -%endif - INIT_XMM sse2 YUYV_TO_Y_FN 3, yuyv YUYV_TO_Y_FN 2, uyvy diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index 1e498fddf6..84e94baaf6 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -312,11 +312,9 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %endif ; %1 == 8/9/10/16 %endmacro -%if ARCH_X86_32 +%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 INIT_MMX mmxext yuv2planeX_fn 8, 0, 7 -yuv2planeX_fn 9, 0, 5 -yuv2planeX_fn 10, 0, 5 %endif INIT_XMM sse2 @@ -407,19 +405,11 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset movq m3, [ditherq] ; dither test offsetd, offsetd jz .no_rot -%if mmsize == 16 punpcklqdq m3, m3 -%endif ; mmsize == 16 PALIGNR m3, m3, 3, m2 .no_rot: -%if mmsize == 8 - mova m2, m3 - punpckhbw m3, m4 ; byte->word - punpcklbw m2, m4 ; byte->word -%else punpcklbw m3, m4 mova m2, m3 -%endif %elif %1 == 9 pxor m4, m4 mova m3, [pw_512] @@ -431,36 +421,22 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset %else ; %1 == 16 %if cpuflag(sse4) ; sse4/avx mova m4, [pd_4] -%else ; mmx/sse2 +%else ; sse2 mova m4, [pd_4min0x40000] mova m5, [minshort] -%endif ; mmx/sse2/sse4/avx +%endif ; sse2/sse4/avx %endif ; %1 == .. 
; actual pixel scaling -%if mmsize == 8 - yuv2plane1_mainloop %1, a -%else ; mmsize == 16 test dstq, 15 jnz .unaligned yuv2plane1_mainloop %1, a REP_RET .unaligned: yuv2plane1_mainloop %1, u -%endif ; mmsize == 8/16 REP_RET %endmacro -%if ARCH_X86_32 -INIT_MMX mmx -yuv2plane1_fn 8, 0, 5 -yuv2plane1_fn 16, 0, 3 - -INIT_MMX mmxext -yuv2plane1_fn 9, 0, 3 -yuv2plane1_fn 10, 0, 3 -%endif - INIT_XMM sse2 yuv2plane1_fn 8, 5, 5 yuv2plane1_fn 9, 5, 3 diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 83cabff722..c62ae3dcc2 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -61,13 +61,11 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi %define mov32 mov %endif ; x86-64 %if %2 == 19 -%if mmsize == 8 ; mmx - mova m2, [max_19bit_int] -%elif cpuflag(sse4) +%if cpuflag(sse4) mova m2, [max_19bit_int] %else ; ssse3/sse2 mova m2, [max_19bit_flt] -%endif ; mmx/sse2/ssse3/sse4 +%endif ; sse2/ssse3/sse4 %endif ; %2 == 19 %if %1 == 16 mova m6, [minshort] @@ -144,12 +142,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}] ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix) -%if mmsize == 8 ; mmx - movq m4, m0 - punpckldq m0, m1 - punpckhdq m4, m1 - paddd m0, m4 -%elif notcpuflag(ssse3) ; sse2 +%if notcpuflag(ssse3) ; sse2 mova m4, m0 shufps m0, m1, 10001000b shufps m4, m1, 11011101b @@ -159,7 +152,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}], ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}], ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}] -%endif ; mmx/sse2/ssse3/sse4 +%endif ; sse2/ssse3/sse4 %else ; %3 == 8, i.e. filterSize == 8 scaling ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0] @@ -197,14 +190,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}] ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix) -%if mmsize == 8 - paddd m0, m1 - paddd m4, m5 - movq m1, m0 - punpckldq m0, m4 - punpckhdq m1, m4 - paddd m0, m1 -%elif notcpuflag(ssse3) ; sse2 +%if notcpuflag(ssse3) ; sse2 %if %1 == 8 %define mex m6 %else @@ -233,7 +219,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}], ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}], ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}] -%endif ; mmx/sse2/ssse3/sse4 +%endif ; sse2/ssse3/sse4 %endif ; %3 == 4/8 %else ; %3 == X, i.e. 
any filterSize scaling @@ -274,7 +260,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi mov srcq, srcmemmp .innerloop: - ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5 + ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5 movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}] movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}] %if %1 == 8 @@ -319,12 +305,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi lea filterq, [filterq+(fltsizeq+dlt)*2] -%if mmsize == 8 ; mmx - movq m0, m4 - punpckldq m4, m5 - punpckhdq m0, m5 - paddd m0, m4 -%else ; mmsize == 16 %if notcpuflag(ssse3) ; sse2 mova m1, m4 punpcklqdq m4, m5 @@ -344,7 +324,6 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi phaddd m4, m4 SWAP 0, 4 %endif ; sse2/ssse3/sse4 -%endif ; mmsize == 8/16 %endif ; %3 ==/!= X %if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned @@ -372,7 +351,7 @@ cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsi %endif ; %3 ==/!= X %endif ; %2 == 15/19 %ifnidn %3, X - add wq, (mmsize<use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) { -#if HAVE_MMX_EXTERNAL - if (EXTERNAL_MMX(cpu_flags)) - c->yuv2planeX = yuv2yuvX_mmx; -#endif #if HAVE_MMXEXT_EXTERNAL if (EXTERNAL_MMXEXT(cpu_flags)) c->yuv2planeX = yuv2yuvX_mmxext; @@ -496,6 +469,14 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) c->yuv2planeX = yuv2yuvX_avx2; #endif } +#if ARCH_X86_32 && !HAVE_ALIGNED_STACK + // The better yuv2planeX_8 functions need aligned stack on x86-32, + // so we use MMXEXT in this case if they are not available. + if (EXTERNAL_MMXEXT(cpu_flags)) { + if (c->dstBpc == 8 && !c->use_mmx_vfilter) + c->yuv2planeX = ff_yuv2planeX_8_mmxext; + } +#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */ #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ if (c->srcBpc == 8) { \ @@ -519,12 +500,6 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) ff_hscale16to19_ ## filtersize ## _ ## opt1; \ } \ } while (0) -#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ - switch (filtersize) { \ - case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ - case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ - default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ - } #define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \ switch(c->dstBpc){ \ case 16: do_16_case; break; \ @@ -546,46 +521,6 @@ switch(c->dstBpc){ \ if (!c->chrSrcHSubSample) \ c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \ break -#if ARCH_X86_32 - if (EXTERNAL_MMX(cpu_flags)) { - ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); - ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); - ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT); - - switch (c->srcFormat) { - case AV_PIX_FMT_YA8: - c->lumToYV12 = ff_yuyvToY_mmx; - if (c->needAlpha) - c->alpToYV12 = ff_uyvyToY_mmx; - break; - case AV_PIX_FMT_YUYV422: - c->lumToYV12 = ff_yuyvToY_mmx; - c->chrToYV12 = ff_yuyvToUV_mmx; - break; - case AV_PIX_FMT_UYVY422: - c->lumToYV12 = ff_uyvyToY_mmx; - c->chrToYV12 = ff_uyvyToUV_mmx; - break; - case AV_PIX_FMT_NV12: - c->chrToYV12 = ff_nv12ToUV_mmx; - break; - case AV_PIX_FMT_NV21: - c->chrToYV12 = ff_nv21ToUV_mmx; - break; - case_rgb(rgb24, RGB24, mmx); - case_rgb(bgr24, BGR24, mmx); - case_rgb(bgra, BGRA, mmx); - case_rgb(rgba, RGBA, mmx); - case_rgb(abgr, ABGR, mmx); - 
case_rgb(argb, ARGB, mmx); - default: - break; - } - } - if (EXTERNAL_MMXEXT(cpu_flags)) { - ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1); - } -#endif /* ARCH_X86_32 */ #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ switch (filtersize) { \ case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \ diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 97d8cae613..6190fcb4fe 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -29,13 +29,8 @@ #undef PREFETCH -#if COMPILE_TEMPLATE_MMXEXT #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" #define MOVNTQ2 "movntq " -#else -#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" -#define MOVNTQ2 "movq " -#endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) #define YSCALEYUV2PACKEDX_UV \ @@ -600,13 +595,8 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" -#if COMPILE_TEMPLATE_MMXEXT #undef WRITEBGR24 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) -#else -#undef WRITEBGR24 -#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) -#endif #if HAVE_6REGS static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, @@ -1478,17 +1468,13 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } if (c->srcBpc == 8 && c->dstBpc <= 14) { - // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). -#if COMPILE_TEMPLATE_MMXEXT - if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { - c->hyscale_fast = ff_hyscale_fast_mmxext; - c->hcscale_fast = ff_hcscale_fast_mmxext; - } else { -#endif /* COMPILE_TEMPLATE_MMXEXT */ - c->hyscale_fast = NULL; - c->hcscale_fast = NULL; -#if COMPILE_TEMPLATE_MMXEXT - } -#endif /* COMPILE_TEMPLATE_MMXEXT */ + // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). + if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { + c->hyscale_fast = ff_hyscale_fast_mmxext; + c->hcscale_fast = ff_hcscale_fast_mmxext; + } else { + c->hyscale_fast = NULL; + c->hcscale_fast = NULL; + } } } -- cgit v1.2.3
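A note on the dispatch logic the first paragraph of the commit message relies on: libswscale picks its SIMD kernels at runtime in ff_sws_init_swscale_x86(), where each instruction-set check simply overwrites the function pointer installed by the previous, less capable level. Because every x86-64 CPU reports MMX, MMXEXT, SSE and SSE2, the pointers assigned in the MMX/MMXEXT branches are always replaced before they can ever be called, which is what "overridden by other functions" means above. The following is a minimal, self-contained sketch of that selection pattern; the flag bits, kernel names and the hscale_fn type are invented for illustration and are not the real FFmpeg API.

    #include <stdint.h>
    #include <stdio.h>

    /* Invented stand-ins for the AV_CPU_FLAG_* bits and the scaling kernels. */
    enum { CPU_FLAG_MMX = 1, CPU_FLAG_MMXEXT = 2, CPU_FLAG_SSE2 = 4 };

    typedef void (*hscale_fn)(const uint8_t *src, int16_t *dst, int w);

    static void hscale_c(const uint8_t *s, int16_t *d, int w)    { (void)s; (void)d; (void)w; }
    static void hscale_mmx(const uint8_t *s, int16_t *d, int w)  { (void)s; (void)d; (void)w; }
    static void hscale_sse2(const uint8_t *s, int16_t *d, int w) { (void)s; (void)d; (void)w; }

    /* Same shape as ff_sws_init_swscale_x86(): each more capable level
     * unconditionally overwrites whatever an earlier level installed. */
    static hscale_fn select_hscale(int cpu_flags)
    {
        hscale_fn fn = hscale_c;
        if (cpu_flags & CPU_FLAG_MMX)
            fn = hscale_mmx;   /* dead on x86-64: the SSE2 branch below always runs */
        if (cpu_flags & CPU_FLAG_SSE2)
            fn = hscale_sse2;
        return fn;
    }

    int main(void)
    {
        /* x86-64 baseline: MMX, MMXEXT, SSE and SSE2 are always present. */
        int flags = CPU_FLAG_MMX | CPU_FLAG_MMXEXT | CPU_FLAG_SSE2;
        printf("%s\n", select_hscale(flags) == hscale_sse2 ? "sse2" : "mmx");
        return 0;
    }

Run with the x86-64 baseline flags, the sketch always prints "sse2" whether or not the MMX assignment exists, which is why deleting the MMX-only kernels changes nothing on 64-bit builds.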
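The remark about keeping yuv2yuvX_mmx also deserves a short illustration: the SSE3/AVX2 vertical-filter wrappers only run the wide kernel on aligned data and on spans that fit the vector width, and hand everything else (unaligned destinations and the leftover tail pixels) to the narrower MMX routine. Below is a rough sketch of that hand-off pattern in plain C; wide_block, narrow_row and the WIDE constant are made-up names, not the actual libswscale functions or their signatures.

    #include <stddef.h>
    #include <stdint.h>

    #define WIDE 32  /* bytes processed per iteration by the hypothetical wide kernel */

    /* Wide kernel: requires an aligned dst and a length that is a multiple of WIDE. */
    static void wide_block(uint8_t *dst, size_t n) { (void)dst; (void)n; }
    /* Narrow fallback: handles any length and any alignment. */
    static void narrow_row(uint8_t *dst, size_t n) { (void)dst; (void)n; }

    static void filter_row(uint8_t *dst, size_t width)
    {
        /* Unaligned destination: fall back to the narrow routine for the whole row. */
        if ((uintptr_t)dst & (WIDE - 1)) {
            narrow_row(dst, width);
            return;
        }
        /* Aligned: wide kernel for the largest multiple-of-WIDE prefix ... */
        size_t body = width & ~(size_t)(WIDE - 1);
        wide_block(dst, body);
        /* ... and the narrow routine for the remaining tail pixels. */
        narrow_row(dst + body, width - body);
    }

Only the MMX variant is kept because that is the routine the existing SSE3/AVX2 wrappers call; as the message notes, an earlier version of 554c2bc7086f49ef5a6a989ad6bc4bc11807eb6f used the MMXEXT variant for this instead, but that is not what was applied.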