diff options
Diffstat (limited to 'libswscale/x86')
-rw-r--r-- | libswscale/x86/Makefile | 4 | ||||
-rw-r--r-- | libswscale/x86/hscale_fast_bilinear_simd.c | 359 | ||||
-rw-r--r-- | libswscale/x86/input.asm | 252 | ||||
-rw-r--r-- | libswscale/x86/output.asm | 12 | ||||
-rw-r--r-- | libswscale/x86/rgb2rgb.c | 17 | ||||
-rw-r--r-- | libswscale/x86/rgb2rgb_template.c | 234 | ||||
-rw-r--r-- | libswscale/x86/scale.asm | 14 | ||||
-rw-r--r-- | libswscale/x86/swscale.c | 178 | ||||
-rw-r--r-- | libswscale/x86/swscale_template.c | 450 | ||||
-rw-r--r-- | libswscale/x86/w64xmmtest.c | 8 | ||||
-rw-r--r-- | libswscale/x86/yuv2rgb.c | 25 | ||||
-rw-r--r-- | libswscale/x86/yuv2rgb_template.c | 46 |
12 files changed, 1082 insertions, 517 deletions
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index b94b14abbb..69012077bb 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -1,7 +1,11 @@ +$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS) + OBJS += x86/rgb2rgb.o \ x86/swscale.o \ x86/yuv2rgb.o \ +MMX-OBJS += x86/hscale_fast_bilinear_simd.o \ + OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o YASM-OBJS += x86/input.o \ diff --git a/libswscale/x86/hscale_fast_bilinear_simd.c b/libswscale/x86/hscale_fast_bilinear_simd.c new file mode 100644 index 0000000000..b37b63c3ec --- /dev/null +++ b/libswscale/x86/hscale_fast_bilinear_simd.c @@ -0,0 +1,359 @@ +/* + * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../swscale_internal.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" + +#define RET 0xC3 // near return opcode for x86 +#define PREFETCH "prefetchnta" + +#if HAVE_INLINE_ASM +av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, + int16_t *filter, int32_t *filterPos, + int numSplits) +{ + uint8_t *fragmentA; + x86_reg imm8OfPShufW1A; + x86_reg imm8OfPShufW2A; + x86_reg fragmentLengthA; + uint8_t *fragmentB; + x86_reg imm8OfPShufW1B; + x86_reg imm8OfPShufW2B; + x86_reg fragmentLengthB; + int fragmentPos; + + int xpos, i; + + // create an optimized horizontal scaling routine + /* This scaler is made of runtime-generated MMXEXT code using specially tuned + * pshufw instructions. For every four output pixels, if four input pixels + * are enough for the fast bilinear scaling, then a chunk of fragmentB is + * used. If five input pixels are needed, then a chunk of fragmentA is used. + */ + + // code fragment + + __asm__ volatile ( + "jmp 9f \n\t" + // Begin + "0: \n\t" + "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" + "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" + "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "pshufw $0xFF, %%mm1, %%mm1 \n\t" + "1: \n\t" + "pshufw $0xFF, %%mm0, %%mm0 \n\t" + "2: \n\t" + "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "psllw $7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + // End + "9: \n\t" + "lea " LOCAL_MANGLE(0b) ", %0 \n\t" + "lea " LOCAL_MANGLE(1b) ", %1 \n\t" + "lea " LOCAL_MANGLE(2b) ", %2 \n\t" + "dec %1 \n\t" + "dec %2 \n\t" + "sub %0, %1 \n\t" + "sub %0, %2 \n\t" + "lea " LOCAL_MANGLE(9b) ", %3 \n\t" + "sub %0, %3 \n\t" + + + : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A), + "=r" (fragmentLengthA) + ); + + __asm__ volatile ( + "jmp 9f \n\t" + // Begin + "0: \n\t" + "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" + "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "pshufw $0xFF, %%mm0, %%mm1 \n\t" + "1: \n\t" + "pshufw $0xFF, %%mm0, %%mm0 \n\t" + "2: \n\t" + "psubw %%mm1, %%mm0 \n\t" + "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "psllw $7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + + "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" + + "add $8, %%"REG_a" \n\t" + // End + "9: \n\t" + "lea " LOCAL_MANGLE(0b) ", %0 \n\t" + "lea " LOCAL_MANGLE(1b) ", %1 \n\t" + "lea " LOCAL_MANGLE(2b) ", %2 \n\t" + "dec %1 \n\t" + "dec %2 \n\t" + "sub %0, %1 \n\t" + "sub %0, %2 \n\t" + "lea " LOCAL_MANGLE(9b) ", %3 \n\t" + "sub %0, %3 \n\t" + + + : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B), + "=r" (fragmentLengthB) + ); + + xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers + fragmentPos = 0; + + for (i = 0; i < dstW / numSplits; i++) { + int xx = xpos >> 16; + + if ((i & 3) == 0) { + int a = 0; + int b = ((xpos + xInc) >> 16) - xx; + int c = ((xpos + xInc * 2) >> 16) - xx; + int d = ((xpos + xInc * 3) >> 16) - xx; + int inc = (d + 1 < 4); + uint8_t *fragment = inc ? fragmentB : fragmentA; + x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A; + x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A; + x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA; + int maxShift = 3 - (d + inc); + int shift = 0; + + if (filterCode) { + filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9; + filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9; + filterPos[i / 2] = xx; + + memcpy(filterCode + fragmentPos, fragment, fragmentLength); + + filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) | + ((b + inc) << 2) | + ((c + inc) << 4) | + ((d + inc) << 6); + filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) | + (c << 4) | + (d << 6); + + if (i + 4 - inc >= dstW) + shift = maxShift; // avoid overread + else if ((filterPos[i / 2] & 3) <= maxShift) + shift = filterPos[i / 2] & 3; // align + + if (shift && i >= shift) { + filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift; + filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift; + filterPos[i / 2] -= shift; + } + } + + fragmentPos += fragmentLength; + + if (filterCode) + filterCode[fragmentPos] = RET; + } + xpos += xInc; + } + if (filterCode) + filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part + + return fragmentPos + 1; +} + +void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, + int dstWidth, const uint8_t *src, + int srcW, int xInc) +{ + int32_t *filterPos = c->hLumFilterPos; + int16_t *filter = c->hLumFilter; + void *mmxextFilterCode = c->lumMmxextFilterCode; + int i; +#if ARCH_X86_64 + uint64_t retsave; +#else +#if defined(PIC) + uint64_t ebxsave; +#endif +#endif + + __asm__ volatile( +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %5 \n\t" // retsave +#else +#if defined(PIC) + "mov %%"REG_b", %5 \n\t" // ebxsave +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, %%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + +#if ARCH_X86_64 +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ + "add %%"REG_S", %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#else +#define CALL_MMXEXT_FILTER_CODE \ + "movl (%%"REG_b"), %%esi \n\t"\ + "call *%4 \n\t"\ + "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ + "add %%"REG_a", %%"REG_D" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + +#endif /* ARCH_X86_64 */ + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if ARCH_X86_64 + "mov %5, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#else +#if defined(PIC) + "mov %5, %%"REG_b" \n\t" +#endif +#endif + :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode) +#if ARCH_X86_64 + ,"m"(retsave) +#else +#if defined(PIC) + ,"m" (ebxsave) +#endif +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if ARCH_X86_64 || !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) + dst[i] = src[srcW-1]*128; +} + +void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, + int dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc) +{ + int32_t *filterPos = c->hChrFilterPos; + int16_t *filter = c->hChrFilter; + void *mmxextFilterCode = c->chrMmxextFilterCode; + int i; +#if ARCH_X86_64 + DECLARE_ALIGNED(8, uint64_t, retsave); +#else +#if defined(PIC) + DECLARE_ALIGNED(8, uint64_t, ebxsave); +#endif +#endif + __asm__ volatile( +#if ARCH_X86_64 + "mov -8(%%rsp), %%"REG_a" \n\t" + "mov %%"REG_a", %7 \n\t" // retsave +#else +#if defined(PIC) + "mov %%"REG_b", %7 \n\t" // ebxsave +#endif +#endif + "pxor %%mm7, %%mm7 \n\t" + "mov %0, %%"REG_c" \n\t" + "mov %1, %%"REG_D" \n\t" + "mov %2, %%"REG_d" \n\t" + "mov %3, %%"REG_b" \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" // i + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + "xor %%"REG_a", %%"REG_a" \n\t" // i + "mov %5, %%"REG_c" \n\t" // src2 + "mov %6, %%"REG_D" \n\t" // dst2 + PREFETCH" (%%"REG_c") \n\t" + PREFETCH" 32(%%"REG_c") \n\t" + PREFETCH" 64(%%"REG_c") \n\t" + + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + CALL_MMXEXT_FILTER_CODE + +#if ARCH_X86_64 + "mov %7, %%"REG_a" \n\t" + "mov %%"REG_a", -8(%%rsp) \n\t" +#else +#if defined(PIC) + "mov %7, %%"REG_b" \n\t" +#endif +#endif + :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), + "m" (mmxextFilterCode), "m" (src2), "m"(dst2) +#if ARCH_X86_64 + ,"m"(retsave) +#else +#if defined(PIC) + ,"m" (ebxsave) +#endif +#endif + : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D +#if ARCH_X86_64 || !defined(PIC) + ,"%"REG_b +#endif + ); + + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { + dst1[i] = src1[srcW-1]*128; + dst2[i] = src2[srcW-1]*128; + } +} +#endif //HAVE_INLINE_ASM diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm index 6f5677e1fd..af9afcaa53 100644 --- a/libswscale/x86/input.asm +++ b/libswscale/x86/input.asm @@ -4,20 +4,20 @@ ;* into YUV planes also. ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -35,33 +35,59 @@ SECTION_RODATA %define GV 0xD0E3 %define BV 0xF6E4 -rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15 -rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15 -bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY -bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY -rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY -rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY -bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU -bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU -rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU -rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU -bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV -bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV -rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV -rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV - -rgba_Ycoeff_rb: times 4 dw RY, BY -rgba_Ycoeff_br: times 4 dw BY, RY -rgba_Ycoeff_ga: times 4 dw GY, 0 -rgba_Ycoeff_ag: times 4 dw 0, GY -rgba_Ucoeff_rb: times 4 dw RU, BU -rgba_Ucoeff_br: times 4 dw BU, RU -rgba_Ucoeff_ga: times 4 dw GU, 0 -rgba_Ucoeff_ag: times 4 dw 0, GU -rgba_Vcoeff_rb: times 4 dw RV, BV -rgba_Vcoeff_br: times 4 dw BV, RV -rgba_Vcoeff_ga: times 4 dw GV, 0 -rgba_Vcoeff_ag: times 4 dw 0, GV +rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15 +rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15 +%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq +%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq +%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq +%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq +%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq +%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq +%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq +%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq +%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq +%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq +%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq +%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq + +%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq +%define rgba_Ycoeff_br 16*4 + 16*13 + tableq +%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq +%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq +%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq +%define rgba_Ucoeff_br 16*4 + 16*17 + tableq +%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq +%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq +%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq +%define rgba_Vcoeff_br 16*4 + 16*21 + tableq +%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq +%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq + +; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY +; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY +; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY +; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY +; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU +; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU +; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU +; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU +; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV +; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV +; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV +; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV + +; rgba_Ycoeff_rb: times 4 dw RY, BY +; rgba_Ycoeff_br: times 4 dw BY, RY +; rgba_Ycoeff_ga: times 4 dw GY, 0 +; rgba_Ycoeff_ag: times 4 dw 0, GY +; rgba_Ucoeff_rb: times 4 dw RU, BU +; rgba_Ucoeff_br: times 4 dw BU, RU +; rgba_Ucoeff_ga: times 4 dw GU, 0 +; rgba_Ucoeff_ag: times 4 dw 0, GU +; rgba_Vcoeff_rb: times 4 dw RV, BV +; rgba_Vcoeff_br: times 4 dw BV, RV +; rgba_Vcoeff_ga: times 4 dw GV, 0 +; rgba_Vcoeff_ag: times 4 dw 0, GV shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \ 6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80 @@ -82,7 +108,7 @@ SECTION .text ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_Y_FN 2-3 -cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w +cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table %if mmsize == 8 mova m5, [%2_Ycoeff_12x4] mova m6, [%2_Ycoeff_3x56] @@ -114,6 +140,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w %if ARCH_X86_64 movsxd wq, wd %endif + add wq, wq add dstq, wq neg wq %if notcpuflag(ssse3) @@ -157,12 +184,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7] paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] } paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop REP_RET %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8 @@ -171,7 +197,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2 = rgb or bgr %macro RGB24_TO_UV_FN 2-3 -cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [%2_Ucoeff_12x4] mova m9, [%2_Ucoeff_3x56] @@ -202,10 +228,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w %endif ; x86-32/64 %endif ; cpuflag(ssse3) %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq add dstUq, wq add dstVq, wq neg wq @@ -263,23 +290,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] } paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] } paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] } - psrad m0, 15 - psrad m2, 15 - psrad m1, 15 - psrad m4, 15 + psrad m0, 9 + psrad m2, 9 + psrad m1, 9 + psrad m4, 9 packssdw m0, m1 ; (word) { U[0-7] } packssdw m2, m4 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-3] } - packuswb m2, m2 ; (byte) { V[0-3] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop REP_RET %endif ; ARCH_X86_64 && %0 == 3 @@ -305,13 +329,15 @@ RGB24_FUNCS 10, 12 INIT_XMM ssse3 RGB24_FUNCS 11, 13 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB24_FUNCS 11, 13 +%endif ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_Y_FN 5-6 -cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w +cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table mova m5, [rgba_Ycoeff_%2%4] mova m6, [rgba_Ycoeff_%3%5] %if %0 == 6 @@ -321,7 +347,9 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w %if ARCH_X86_64 movsxd wq, wd %endif - lea srcq, [srcq+wq*4] + add wq, wq + sub wq, mmsize - 1 + lea srcq, [srcq+wq*2] add dstq, wq neg wq mova m4, [rgb_Yrnd] @@ -329,8 +357,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w psrlw m7, 8 ; (word) { 0x00ff } x4 .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] @@ -340,13 +368,29 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w paddd m2, m4 ; += rgb_Yrnd paddd m0, m1 ; (dword) { Y[0-3] } paddd m2, m3 ; (dword) { Y[4-7] } - psrad m0, 15 - psrad m2, 15 + psrad m0, 9 + psrad m2, 9 packssdw m0, m2 ; (word) { Y[0-7] } - packuswb m0, m0 ; (byte) { Y[0-7] } - movh [dstq+wq], m0 - add wq, mmsize / 2 + mova [dstq+wq], m0 + add wq, mmsize jl .loop + sub wq, mmsize - 1 + jz .end + add srcq, 2*mmsize - 2 + add dstq, mmsize - 1 +.loop2: + movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7] + pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3] + pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3] + paddd m0, m4 ; += rgb_Yrnd + paddd m0, m1 ; (dword) { Y[0-3] } + psrad m0, 9 + packssdw m0, m0 ; (word) { Y[0-7] } + movd [dstq+wq], m0 + add wq, 2 + jl .loop2 +.end: REP_RET %endif ; %0 == 3 %endmacro @@ -354,7 +398,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w ; %1 = nr. of XMM registers ; %2-5 = rgba, bgra, argb or abgr (in individual characters) %macro RGB32_TO_UV_FN 5-6 -cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table %if ARCH_X86_64 mova m8, [rgba_Ucoeff_%2%4] mova m9, [rgba_Ucoeff_%3%5] @@ -375,21 +419,23 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w %else ; ARCH_X86_64 && %0 == 6 .body: %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif + add wq, wq + sub wq, mmsize - 1 add dstUq, wq add dstVq, wq - lea srcq, [srcq+wq*4] + lea srcq, [srcq+wq*2] neg wq pcmpeqb m7, m7 psrlw m7, 8 ; (word) { 0x00ff } x4 mova m6, [rgb_UVrnd] .loop: ; FIXME check alignment and use mova - movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] - movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] + movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3] + movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7] DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] @@ -405,26 +451,48 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7] paddd m3, m6 ; += rgb_UVrnd paddd m5, m6 ; += rgb_UVrnd - psrad m0, 15 + psrad m0, 9 paddd m1, m3 ; (dword) { V[4-7] } paddd m4, m5 ; (dword) { U[4-7] } - psrad m2, 15 - psrad m4, 15 - psrad m1, 15 + psrad m2, 9 + psrad m4, 9 + psrad m1, 9 packssdw m0, m4 ; (word) { U[0-7] } packssdw m2, m1 ; (word) { V[0-7] } %if mmsize == 8 - packuswb m0, m0 ; (byte) { U[0-7] } - packuswb m2, m2 ; (byte) { V[0-7] } - movh [dstUq+wq], m0 - movh [dstVq+wq], m2 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %else ; mmsize == 16 - packuswb m0, m2 ; (byte) { U[0-7], V[0-7] } - movh [dstUq+wq], m0 - movhps [dstVq+wq], m0 + mova [dstUq+wq], m0 + mova [dstVq+wq], m2 %endif ; mmsize == 8/16 - add wq, mmsize / 2 + add wq, mmsize jl .loop + sub wq, mmsize - 1 + jz .end + add srcq , 2*mmsize - 2 + add dstUq, mmsize - 1 + add dstVq, mmsize - 1 +.loop2: + movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3] + DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7] + pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3] + pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3] + pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3] + pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3] + paddd m3, m6 ; += rgb_UVrnd + paddd m1, m6 ; += rgb_UVrnd + paddd m2, m3 ; (dword) { V[0-3] } + paddd m0, m1 ; (dword) { U[0-3] } + psrad m0, 9 + psrad m2, 9 + packssdw m0, m0 ; (word) { U[0-7] } + packssdw m2, m2 ; (word) { V[0-7] } + movd [dstUq+wq], m0 + movd [dstVq+wq], m2 + add wq, 2 + jl .loop2 +.end: REP_RET %endif ; ARCH_X86_64 && %0 == 3 %endmacro @@ -451,8 +519,10 @@ RGB32_FUNCS 0, 0 INIT_XMM sse2 RGB32_FUNCS 8, 12 +%if HAVE_AVX_EXTERNAL INIT_XMM avx RGB32_FUNCS 8, 12 +%endif ;----------------------------------------------------------------------------- ; YUYV/UYVY/NV12/NV21 packed pixel shuffling. @@ -489,7 +559,7 @@ RGB32_FUNCS 8, 12 ; will be the same (i.e. YUYV+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_Y_FN 2-3 -cglobal %2ToY, 3, 3, %1, dst, src, w +cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w %if ARCH_X86_64 movsxd wq, wd %endif @@ -559,11 +629,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w ; will be the same (i.e. UYVY+AVX), and thus we don't need to ; split the loop in an aligned and unaligned case %macro YUYV_TO_UV_FN 2-3 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -593,8 +663,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w .loop_%1: mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... } mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... } - pand m2, m0, m4 ; (word) { U0, U1, ..., U7 } - pand m3, m1, m4 ; (word) { U8, U9, ..., U15 } + pand m2, m0, m5 ; (word) { U0, U1, ..., U7 } + pand m3, m1, m5 ; (word) { U8, U9, ..., U15 } psrlw m0, 8 ; (word) { V0, V1, ..., V7 } psrlw m1, 8 ; (word) { V8, V9, ..., V15 } packuswb m2, m3 ; (byte) { U0, ..., U15 } @@ -614,11 +684,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w ; %1 = nr. of XMM registers ; %2 = nv12 or nv21 %macro NVXX_TO_UV_FN 2 -cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w +cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w %if ARCH_X86_64 - movsxd wq, dword r4m + movsxd wq, dword r5m %else ; x86-32 - mov wq, r4m + mov wq, r5m %endif add dstUq, wq add dstVq, wq @@ -626,8 +696,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w test srcq, 15 %endif lea srcq, [srcq+wq*2] - pcmpeqb m4, m4 ; (byte) { 0xff } x 16 - psrlw m4, 8 ; (word) { 0x00ff } x 8 + pcmpeqb m5, m5 ; (byte) { 0xff } x 16 + psrlw m5, 8 ; (word) { 0x00ff } x 8 %if mmsize == 16 jnz .loop_u_start neg wq @@ -659,6 +729,7 @@ YUYV_TO_UV_FN 3, uyvy NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%if HAVE_AVX_EXTERNAL INIT_XMM avx ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but ; that's not faster in practice @@ -666,3 +737,4 @@ YUYV_TO_UV_FN 3, yuyv YUYV_TO_UV_FN 3, uyvy, 1 NVXX_TO_UV_FN 5, nv12 NVXX_TO_UV_FN 5, nv21 +%endif diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index e1ceded756..9ea4af9535 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -3,20 +3,20 @@ ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* Kieran Kunhya <kieran@kunhya.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -264,10 +264,12 @@ yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 yuv2planeX_fn 16, 8, 5 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 +%endif ; %1=outout-bpc, %2=alignment (u/a) %macro yuv2plane1_mainloop 2 @@ -402,8 +404,10 @@ yuv2plane1_fn 16, 6, 3 INIT_XMM sse4 yuv2plane1_fn 16, 5, 3 +%if HAVE_AVX_EXTERNAL INIT_XMM avx yuv2plane1_fn 8, 5, 5 yuv2plane1_fn 9, 5, 3 yuv2plane1_fn 10, 5, 3 yuv2plane1_fn 16, 5, 3 +%endif diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 9cfe831e3c..b80e869e0c 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -6,20 +6,20 @@ * Written by Nick Kurshev. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -72,8 +72,14 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; +DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; +DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; + +DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset); +DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111); +DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); -#define RGB2YUV_SHIFT 8 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) @@ -125,6 +131,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; #undef COMPILE_TEMPLATE_AMD3DNOW #define COMPILE_TEMPLATE_MMXEXT 0 #define COMPILE_TEMPLATE_SSE2 0 +#define COMPILE_TEMPLATE_AVX 0 #define COMPILE_TEMPLATE_AMD3DNOW 1 #define RENAME(a) a ## _3dnow #include "rgb2rgb_template.c" diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c index 5d34c21711..e97ba4fe82 100644 --- a/libswscale/x86/rgb2rgb_template.c +++ b/libswscale/x86/rgb2rgb_template.c @@ -7,20 +7,20 @@ * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * lot of big-endian byte order fixes by Alex Beregszaszi * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -131,14 +131,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr "movq %%mm4, %%mm3 \n\t" \ "psllq $48, %%mm2 \n\t" \ "psllq $32, %%mm3 \n\t" \ - "pand "MANGLE(mask24hh)", %%mm2\n\t" \ - "pand "MANGLE(mask24hhh)", %%mm3\n\t" \ "por %%mm2, %%mm0 \n\t" \ "psrlq $16, %%mm1 \n\t" \ "psrlq $32, %%mm4 \n\t" \ "psllq $16, %%mm5 \n\t" \ "por %%mm3, %%mm1 \n\t" \ - "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \ "por %%mm5, %%mm4 \n\t" \ \ MOVNTQ" %%mm0, (%0) \n\t" \ @@ -168,6 +165,7 @@ static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int sr "movq %%mm5, %%mm7 \n\t" STORE_BGR24_MMX :: "r"(dest), "r"(s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); dest += 24; s += 32; @@ -717,27 +715,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s } } -/* - I use less accurate approximation here by simply left-shifting the input - value and filling the low order bits with zeroes. This method improves PNG - compression but this scheme cannot reproduce white exactly, since it does - not generate an all-ones maximum value; the net effect is to darken the - image slightly. - - The better method should be "left bit replication": - - 4 3 2 1 0 - --------- - 1 1 0 1 1 - - 7 6 5 4 3 2 1 0 - ---------------- - 1 1 0 1 1 1 1 0 - |=======| |===| - | leftmost bits repeated to fill open bits - | - original bits -*/ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) { const uint16_t *end; @@ -756,9 +733,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -786,9 +764,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -809,6 +788,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr :"=m"(*d) :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) + NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( @@ -825,6 +805,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr STORE_BGR24_MMX :: "r"(d), "m"(*s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); d += 24; s += 8; @@ -834,9 +815,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); } } @@ -858,9 +839,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -888,9 +871,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "movq %%mm2, %%mm5 \n\t" @@ -910,6 +895,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr "por %%mm5, %%mm3 \n\t" :"=m"(*d) :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) + NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi) :"memory"); /* borrowed 32 to 24 */ __asm__ volatile( @@ -926,6 +912,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr STORE_BGR24_MMX :: "r"(d), "m"(*s) + NAMED_CONSTRAINTS_ADD(mask24l,mask24h) :"memory"); d += 24; s += 8; @@ -935,9 +922,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); } } @@ -980,11 +967,13 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $2, %%mm1 \n\t" - "psrlq $7, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw %5, %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) + ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) + NAMED_CONSTRAINTS_ADD(mul15_hi) :"memory"); d += 16; s += 4; @@ -994,9 +983,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x3E0)>>2; - *d++ = (bgr&0x7C00)>>7; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); + *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); *d++ = 255; } } @@ -1021,11 +1010,14 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s "pand %2, %%mm0 \n\t" "pand %3, %%mm1 \n\t" "pand %4, %%mm2 \n\t" - "psllq $3, %%mm0 \n\t" - "psrlq $3, %%mm1 \n\t" - "psrlq $8, %%mm2 \n\t" + "psllq $5, %%mm0 \n\t" + "psrlq $1, %%mm2 \n\t" + "pmulhw %5, %%mm0 \n\t" + "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" + "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" PACK_RGB32 - ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r) + ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) + NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi) :"memory"); d += 16; s += 4; @@ -1035,9 +1027,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s while (s < end) { register uint16_t bgr; bgr = *s++; - *d++ = (bgr&0x1F)<<3; - *d++ = (bgr&0x7E0)>>3; - *d++ = (bgr&0xF800)>>8; + *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); + *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); + *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); *d++ = 255; } } @@ -1098,7 +1090,7 @@ static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one) : "memory"); for (; idx<15; idx+=4) { - register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; + register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00; v &= 0xff00ff; *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16); } @@ -1150,6 +1142,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int sr "2: \n\t" : "+a" (mmx_size) : "r" (src-mmx_size), "r"(dst-mmx_size) + NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) ); __asm__ volatile(SFENCE:::"memory"); @@ -1485,6 +1478,7 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), "g" (-mmxSize) + NAMED_CONSTRAINTS_ADD(mmx_ff) : "%"REG_a ); @@ -1629,18 +1623,33 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t * others are ignored in the C version. * FIXME: Write HQ version. */ +#if HAVE_7REGS static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, - int lumStride, int chromStride, int srcStride) + int lumStride, int chromStride, int srcStride, + int32_t *rgb2yuv) { +#define BGR2Y_IDX "16*4+16*32" +#define BGR2U_IDX "16*4+16*33" +#define BGR2V_IDX "16*4+16*34" int y; const x86_reg chromWidth= width>>1; + + if (height > 2) { + ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); + src += 2*srcStride; + ydst += 2*lumStride; + udst += chromStride; + vdst += chromStride; + height -= 2; + } + for (y=0; y<height-2; y+=2) { int i; for (i=0; i<2; i++) { __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" + "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" @@ -1659,12 +1668,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm0 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1684,12 +1691,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "pmaddwd %%mm6, %%mm1 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" "pmaddwd %%mm6, %%mm3 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm1, %%mm4 \n\t" "packssdw %%mm3, %%mm2 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1704,7 +1709,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" "add $8, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width) + : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) + NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) : "%"REG_a, "%"REG_d ); ydst += lumStride; @@ -1714,7 +1720,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ __asm__ volatile( "mov %4, %%"REG_a" \n\t" "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" + "movq "BGR2U_IDX"(%5), %%mm6 \n\t" "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" "add %%"REG_d", %%"REG_d" \n\t" @@ -1763,19 +1769,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm0, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm0 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm0 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm0 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm0 \n\t" @@ -1825,19 +1829,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm2 \n\t" #endif - "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" - "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" + "movq "BGR2V_IDX"(%5), %%mm1 \n\t" + "movq "BGR2V_IDX"(%5), %%mm3 \n\t" "pmaddwd %%mm4, %%mm1 \n\t" "pmaddwd %%mm2, %%mm3 \n\t" "pmaddwd %%mm6, %%mm4 \n\t" "pmaddwd %%mm6, %%mm2 \n\t" -#ifndef FAST_BGR2YV12 "psrad $8, %%mm4 \n\t" "psrad $8, %%mm1 \n\t" "psrad $8, %%mm2 \n\t" "psrad $8, %%mm3 \n\t" -#endif "packssdw %%mm2, %%mm4 \n\t" "packssdw %%mm3, %%mm1 \n\t" "pmaddwd %%mm5, %%mm4 \n\t" @@ -1856,7 +1858,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ "movd %%mm0, (%3, %%"REG_a") \n\t" "add $4, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) + : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) + NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) : "%"REG_a, "%"REG_d ); @@ -1869,8 +1872,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ SFENCE" \n\t" :::"memory"); - rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride); + ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); } +#endif /* HAVE_7REGS */ #endif /* !COMPILE_TEMPLATE_SSE2 */ #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX @@ -1883,6 +1887,7 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui for (h=0; h < height; h++) { int w; + if (width >= 16) #if COMPILE_TEMPLATE_SSE2 __asm__( "xor %%"REG_a", %%"REG_a" \n\t" @@ -1900,7 +1905,7 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui "cmp %3, %%"REG_a" \n\t" " jb 1b \n\t" ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) - : "memory", "%"REG_a"" + : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a ); #else __asm__( @@ -1938,16 +1943,22 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui src2 += src2Stride; } __asm__( +#if !COMPILE_TEMPLATE_SSE2 EMMS" \n\t" +#endif SFENCE" \n\t" ::: "memory" ); } #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV, - const uint8_t *src, const uint8_t *unused, int w, + const uint8_t *unused, + const uint8_t *src1, + const uint8_t *src2, + int w, uint32_t *unused2); static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, @@ -1956,18 +1967,21 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t int h; for (h = 0; h < height; h++) { - RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL); + RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL); src += srcStride; dst1 += dst1Stride; dst2 += dst2Stride; } __asm__( +#if !COMPILE_TEMPLATE_SSE2 EMMS" \n\t" +#endif SFENCE" \n\t" ::: "memory" ); } #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ +#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */ #if !COMPILE_TEMPLATE_SSE2 #if !COMPILE_TEMPLATE_AMD3DNOW @@ -2187,6 +2201,44 @@ static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count } } +static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count) +{ + src ++; + dst += count; + src += 2*count; + count= - count; + + if(count < -16) { + count += 16; + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $8, %%mm7 \n\t" + "1: \n\t" + "movq -32(%1, %0, 2), %%mm0 \n\t" + "movq -24(%1, %0, 2), %%mm1 \n\t" + "movq -16(%1, %0, 2), %%mm2 \n\t" + "movq -8(%1, %0, 2), %%mm3 \n\t" + "pand %%mm7, %%mm0 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "pand %%mm7, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + MOVNTQ" %%mm0,-16(%2, %0) \n\t" + MOVNTQ" %%mm2,- 8(%2, %0) \n\t" + "add $16, %0 \n\t" + " js 1b \n\t" + : "+r"(count) + : "r"(src), "r"(dst) + ); + count -= 16; + } + while(count<0) { + dst[count]= src[2*count]; + count++; + } +} + #if !COMPILE_TEMPLATE_AMD3DNOW static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) { @@ -2397,7 +2449,7 @@ static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2423,7 +2475,7 @@ static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { RENAME(extract_even)(src, ydst, width); @@ -2447,10 +2499,10 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); + RENAME(extract_odd)(src, ydst, width); if(y&1) { RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); udst+= chromStride; @@ -2473,10 +2525,10 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co int lumStride, int chromStride, int srcStride) { int y; - const int chromWidth= -((-width)>>1); + const int chromWidth = FF_CEIL_RSHIFT(width, 1); for (y=0; y<height; y++) { - RENAME(extract_even)(src+1, ydst, width); + RENAME(extract_odd)(src, ydst, width); RENAME(extract_even2)(src, udst, vdst, chromWidth); src += srcStride; @@ -2529,7 +2581,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW planar2x = RENAME(planar2x); #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */ - rgb24toyv12 = RENAME(rgb24toyv12); +#if HAVE_7REGS + ff_rgb24toyv12 = RENAME(rgb24toyv12); +#endif /* HAVE_7REGS */ yuyvtoyuv420 = RENAME(yuyvtoyuv420); uyvytoyuv420 = RENAME(uyvytoyuv420); @@ -2538,7 +2592,9 @@ static av_cold void RENAME(rgb2rgb_init)(void) #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX interleaveBytes = RENAME(interleaveBytes); #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */ +#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM deinterleaveBytes = RENAME(deinterleaveBytes); #endif +#endif } diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm index 440a27b0ba..7af92f7f52 100644 --- a/libswscale/x86/scale.asm +++ b/libswscale/x86/scale.asm @@ -2,20 +2,20 @@ ;* x86-optimized horizontal line scaling functions ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3 SCALE_FUNCS 8, 15, %1 SCALE_FUNCS 9, 15, %2 SCALE_FUNCS 10, 15, %2 +SCALE_FUNCS 12, 15, %2 +SCALE_FUNCS 14, 15, %2 SCALE_FUNCS 16, 15, %3 %endif ; !sse4 SCALE_FUNCS 8, 19, %1 SCALE_FUNCS 9, 19, %2 SCALE_FUNCS 10, 19, %2 +SCALE_FUNCS 12, 19, %2 +SCALE_FUNCS 14, 19, %2 SCALE_FUNCS 16, 19, %3 %endmacro @@ -420,7 +424,7 @@ INIT_MMX mmx SCALE_FUNCS2 0, 0, 0 %endif INIT_XMM sse2 -SCALE_FUNCS2 6, 7, 8 +SCALE_FUNCS2 7, 6, 8 INIT_XMM ssse3 SCALE_FUNCS2 6, 6, 8 INIT_XMM sse4 diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index f310a7593f..fe5690d4a3 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -23,6 +23,7 @@ #include "libswscale/swscale.h" #include "libswscale/swscale_internal.h" #include "libavutil/attributes.h" +#include "libavutil/avassert.h" #include "libavutil/intreadwrite.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -57,19 +58,11 @@ DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; -#ifdef FAST_BGR2YV12 -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL; -#else -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; -#endif /* FAST_BGR2YV12 */ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -87,16 +80,23 @@ DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; #include "swscale_template.c" #endif -void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex, +void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex, int lastInLumBuf, int lastInChrBuf) { const int dstH= c->dstH; const int flags= c->flags; +#ifdef NEW_FILTER + SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0]; + SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1]; + SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3]; +#else int16_t **lumPixBuf= c->lumPixBuf; int16_t **chrUPixBuf= c->chrUPixBuf; int16_t **alpPixBuf= c->alpPixBuf; const int vLumBufSize= c->vLumBufSize; const int vChrBufSize= c->vChrBufSize; +#endif + int hasAlpha = c->alpPixBuf != NULL; int32_t *vLumFilterPos= c->vLumFilterPos; int32_t *vChrFilterPos= c->vChrFilterPos; int16_t *vLumFilter= c->vLumFilter; @@ -117,13 +117,22 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI c->greenDither= ff_dither4[dstY&1]; c->redDither= ff_dither8[(dstY+1)&1]; if (dstY < dstH - 2) { - const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; - const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; - const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; +#ifdef NEW_FILTER + const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY; + const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY; + const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL; +#else + const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; + const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; + const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; +#endif int i; - if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) { +#ifdef NEW_FILTER + const int16_t **tmpY = (const int16_t **) lumPlane->tmp; +#else const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize; +#endif int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize); for (i = 0; i < neg; i++) tmpY[i] = lumSrcPtr[neg]; @@ -134,7 +143,11 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI lumSrcPtr = tmpY; if (alpSrcPtr) { +#ifdef NEW_FILTER + const int16_t **tmpA = (const int16_t **) alpPlane->tmp; +#else const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize; +#endif for (i = 0; i < neg; i++) tmpA[i] = alpSrcPtr[neg]; for ( ; i < end; i++) @@ -145,7 +158,11 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI } } if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) { +#ifdef NEW_FILTER + const int16_t **tmpU = (const int16_t **) chrUPlane->tmp; +#else const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize; +#endif int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize); for (i = 0; i < neg; i++) { tmpU[i] = chrUSrcPtr[neg]; @@ -167,7 +184,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI lumMmxFilter[s*i+APCK_COEF/4 ]= lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + if (CONFIG_SWSCALE_ALPHA && hasAlpha) { *(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; alpMmxFilter[s*i+APCK_COEF/4 ]= @@ -186,8 +203,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i]; lumMmxFilter[4*i+2]= lumMmxFilter[4*i+3]= - ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U; + if (CONFIG_SWSCALE_ALPHA && hasAlpha) { *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i]; alpMmxFilter[4*i+2]= alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; @@ -197,12 +214,90 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i]; chrMmxFilter[4*i+2]= chrMmxFilter[4*i+3]= - ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U; } } } } +#if HAVE_MMXEXT +static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + if(((uintptr_t)dest) & 15){ + yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); + return; + } + filterSize--; +#define MAIN_FUNCTION \ + "pxor %%xmm0, %%xmm0 \n\t" \ + "punpcklbw %%xmm0, %%xmm3 \n\t" \ + "movd %4, %%xmm1 \n\t" \ + "punpcklwd %%xmm1, %%xmm1 \n\t" \ + "punpckldq %%xmm1, %%xmm1 \n\t" \ + "punpcklqdq %%xmm1, %%xmm1 \n\t" \ + "psllw $3, %%xmm1 \n\t" \ + "paddw %%xmm1, %%xmm3 \n\t" \ + "psraw $4, %%xmm3 \n\t" \ + "movdqa %%xmm3, %%xmm4 \n\t" \ + "movdqa %%xmm3, %%xmm7 \n\t" \ + "movl %3, %%ecx \n\t" \ + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ + "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ + "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%xmm0, %%xmm2 \n\t"\ + "pmulhw %%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm2, %%xmm3 \n\t"\ + "paddw %%xmm5, %%xmm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%xmm3 \n\t"\ + "psraw $3, %%xmm4 \n\t"\ + "packuswb %%xmm4, %%xmm3 \n\t"\ + "movntdq %%xmm3, (%1, %%"REG_c")\n\t"\ + "add $16, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movdqa %%xmm7, %%xmm3 \n\t" \ + "movdqa %%xmm7, %%xmm4 \n\t" \ + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t" + + if (offset) { + __asm__ volatile( + "movq %5, %%xmm3 \n\t" + "movdqa %%xmm3, %%xmm4 \n\t" + "psrlq $24, %%xmm3 \n\t" + "psllq $40, %%xmm4 \n\t" + "por %%xmm4, %%xmm3 \n\t" + MAIN_FUNCTION + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), + "m"(filterSize), "m"(((uint64_t *) dither)[0]) + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) + "%"REG_d, "%"REG_S, "%"REG_c + ); + } else { + __asm__ volatile( + "movq %5, %%xmm3 \n\t" + MAIN_FUNCTION + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), + "m"(filterSize), "m"(((uint64_t *) dither)[0]) + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) + "%"REG_d, "%"REG_S, "%"REG_c + ); + } +} +#endif + #endif /* HAVE_INLINE_ASM */ #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ @@ -216,10 +311,14 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ SCALE_FUNC(filter_n, 8, 15, opt); \ SCALE_FUNC(filter_n, 9, 15, opt); \ SCALE_FUNC(filter_n, 10, 15, opt); \ + SCALE_FUNC(filter_n, 12, 15, opt); \ + SCALE_FUNC(filter_n, 14, 15, opt); \ SCALE_FUNC(filter_n, 16, 15, opt); \ SCALE_FUNC(filter_n, 8, 19, opt); \ SCALE_FUNC(filter_n, 9, 19, opt); \ SCALE_FUNC(filter_n, 10, 19, opt); \ + SCALE_FUNC(filter_n, 12, 19, opt); \ + SCALE_FUNC(filter_n, 14, 19, opt); \ SCALE_FUNC(filter_n, 16, 19, opt) #define SCALE_FUNCS_MMX(opt) \ @@ -275,11 +374,14 @@ VSCALE_FUNCS(avx, avx); #define INPUT_Y_FUNC(fmt, opt) \ void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \ + const uint8_t *unused1, const uint8_t *unused2, \ int w, uint32_t *unused) #define INPUT_UV_FUNC(fmt, opt) \ void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \ - const uint8_t *src, const uint8_t *unused1, \ - int w, uint32_t *unused2) + const uint8_t *unused0, \ + const uint8_t *src1, \ + const uint8_t *src2, \ + int w, uint32_t *unused) #define INPUT_FUNC(fmt, opt) \ INPUT_Y_FUNC(fmt, opt); \ INPUT_UV_FUNC(fmt, opt) @@ -313,20 +415,31 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) sws_init_swscale_mmxext(c); + if (cpu_flags & AV_CPU_FLAG_SSE3){ + if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND)) + c->yuv2planeX = yuv2yuvX_sse3; + } #endif #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \ if (c->srcBpc == 8) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \ ff_hscale8to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 9) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \ ff_hscale9to19_ ## filtersize ## _ ## opt1; \ } else if (c->srcBpc == 10) { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ + hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \ ff_hscale10to19_ ## filtersize ## _ ## opt1; \ - } else /* c->srcBpc == 16 */ { \ - hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ + } else if (c->srcBpc == 12) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale12to19_ ## filtersize ## _ ## opt1; \ + } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \ + hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \ + ff_hscale14to19_ ## filtersize ## _ ## opt1; \ + } else { /* c->srcBpc == 16 */ \ + av_assert0(c->srcBpc == 16);\ + hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \ ff_hscale16to19_ ## filtersize ## _ ## opt1; \ } \ } while (0) @@ -341,14 +454,15 @@ switch(c->dstBpc){ \ case 16: do_16_case; break; \ case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \ case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \ - default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \ + case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \ } #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \ switch(c->dstBpc){ \ case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \ case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \ case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \ - default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \ + default: av_assert0(c->dstBpc>8); \ } #define case_rgb(x, X, opt) \ case AV_PIX_FMT_ ## X: \ diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index 1e42ec5b12..bbda6d086e 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,20 +1,20 @@ /* - * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -25,21 +25,101 @@ #undef REAL_MOVNTQ #undef MOVNTQ +#undef MOVNTQ2 #undef PREFETCH -#if COMPILE_TEMPLATE_MMXEXT -#define PREFETCH "prefetchnta" -#else -#define PREFETCH " # nop" -#endif #if COMPILE_TEMPLATE_MMXEXT #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" +#define MOVNTQ2 "movntq " #else #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" +#define MOVNTQ2 "movq " #endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) +#if !COMPILE_TEMPLATE_MMXEXT +static av_always_inline void +dither_8to16(const uint8_t *srcDither, int rot) +{ + if (rot) { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "psrlq $24, %%mm3\n\t" + "psllq $40, %%mm4\n\t" + "por %%mm4, %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } else { + __asm__ volatile("pxor %%mm0, %%mm0\n\t" + "movq (%0), %%mm3\n\t" + "movq %%mm3, %%mm4\n\t" + "punpcklbw %%mm0, %%mm3\n\t" + "punpckhbw %%mm0, %%mm4\n\t" + :: "r"(srcDither) + ); + } +} +#endif + +static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ + dither_8to16(dither, offset); + filterSize--; + __asm__ volatile( + "movd %0, %%mm1\n\t" + "punpcklwd %%mm1, %%mm1\n\t" + "punpckldq %%mm1, %%mm1\n\t" + "psllw $3, %%mm1\n\t" + "paddw %%mm1, %%mm3\n\t" + "paddw %%mm1, %%mm4\n\t" + "psraw $4, %%mm3\n\t" + "psraw $4, %%mm4\n\t" + ::"m"(filterSize) + ); + + __asm__ volatile(\ + "movq %%mm3, %%mm6\n\t" + "movq %%mm4, %%mm7\n\t" + "movl %3, %%ecx\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ".p2align 4 \n\t" /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ + "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t" + MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" + "add $8, %%"REG_c" \n\t"\ + "cmp %2, %%"REG_c" \n\t"\ + "movq %%mm6, %%mm3\n\t" + "movq %%mm7, %%mm4\n\t" + "mov %0, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "g" (filter), + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) + : "%"REG_d, "%"REG_S, "%"REG_c + ); +} + #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ "xor %%"REG_a", %%"REG_a" \n\t"\ @@ -92,6 +172,7 @@ :: "r" (&c->redDither), \ "m" (dummy), "m" (dummy), "m" (dummy),\ "r" (dest), "m" (dstW_reg), "m"(uv_off) \ + NAMED_CONSTRAINTS_ADD(bF8,bFC) \ : "%"REG_a, "%"REG_d, "%"REG_S \ ); @@ -252,7 +333,7 @@ MOVNTQ( q3, 24(dst, index, 4))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) @@ -265,7 +346,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE @@ -278,13 +359,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -298,7 +379,36 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; + + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm7 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } else { + YSCALEYUV2PACKEDX + YSCALEYUV2RGBX + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + YSCALEYUV2PACKEDX_END + } +} + +static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, + int chrFilterSize, const int16_t **alpSrc, + uint8_t *dest, int dstW, int dstY) +{ + x86_reg dummy=0; + x86_reg dstW_reg = dstW; + x86_reg uv_off = c->uv_offx2; if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX @@ -307,13 +417,13 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } else { YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END } } @@ -342,7 +452,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) @@ -355,7 +465,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -366,7 +476,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -379,7 +489,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -390,7 +500,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB16(%4, %5, %%REGa) + WRITERGB16(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -419,7 +529,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm1, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) @@ -432,7 +542,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX @@ -443,7 +553,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" "paddusb "RED_DITHER"(%0), %%mm5\n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -456,7 +566,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX @@ -467,7 +577,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" "paddusb "RED_DITHER"(%0), %%mm5 \n\t" #endif - WRITERGB15(%4, %5, %%REGa) + WRITERGB15(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -521,7 +631,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEBGR24MMXEXT(dst, dstw, index) \ @@ -569,7 +679,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, "add $24, "#dst" \n\t"\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #if COMPILE_TEMPLATE_MMXEXT @@ -580,6 +690,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) #endif +#if HAVE_6REGS static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, @@ -589,17 +700,18 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } @@ -613,20 +725,22 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pxor %%mm7, %%mm7 \n\t" "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize "add %4, %%"REG_c" \n\t" - WRITEBGR24(%%REGc, %5, %%REGa) + WRITEBGR24(%%REGc, "%5", %%REGa) :: "r" (&c->redDither), "m" (dummy), "m" (dummy), "m" (dummy), "r" (dest), "m" (dstW_reg), "m"(uv_off) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S ); } +#endif /* HAVE_6REGS */ #define REAL_WRITEYUY2(dst, dstw, index) \ "packuswb %%mm3, %%mm3 \n\t"\ @@ -641,7 +755,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, MOVNTQ(%%mm7, 8(dst, index, 2))\ \ "add $8, "#index" \n\t"\ - "cmp "#dstw", "#index" \n\t"\ + "cmp "dstw", "#index" \n\t"\ " jb 1b \n\t" #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) @@ -654,7 +768,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX_ACCURATE /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -662,7 +776,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -675,7 +789,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, { x86_reg dummy=0; x86_reg dstW_reg = dstW; - x86_reg uv_off = c->uv_off_byte; + x86_reg uv_off = c->uv_offx2; YSCALEYUV2PACKEDX /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ @@ -683,7 +797,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, "psraw $3, %%mm4 \n\t" "psraw $3, %%mm1 \n\t" "psraw $3, %%mm7 \n\t" - WRITEYUY2(%4, %5, %%REGa) + WRITEYUY2(%4, "%5", %%REGa) YSCALEYUV2PACKEDX_END } @@ -784,15 +898,15 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), "a" (&c->redDither), "r" (abuf0), "r" (abuf1) : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -808,7 +922,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "packuswb %%mm7, %%mm1 \n\t" "pop %1 \n\t" "pop %0 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -822,7 +936,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -839,18 +953,18 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2RGB(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } @@ -862,7 +976,6 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -875,11 +988,12 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } @@ -891,7 +1005,6 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -904,11 +1017,12 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } @@ -960,13 +1074,12 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], const int16_t *buf0 = buf[0], *buf1 = buf[1], *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1109,7 +1222,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1122,7 +1235,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1138,7 +1251,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1151,7 +1264,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1177,11 +1290,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1191,11 +1305,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) + WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) ); } } @@ -1222,11 +1337,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1242,11 +1358,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) + WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8) ); } } @@ -1273,11 +1390,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } else { const int16_t *ubuf1 = ubuf[1]; @@ -1293,11 +1411,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) + WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), "a" (&c->redDither) + NAMED_CONSTRAINTS_ADD(bF8,bFC) ); } } @@ -1354,7 +1473,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1367,7 +1486,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2PACKED1b(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), @@ -1375,203 +1494,20 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, ); } } - -#if COMPILE_TEMPLATE_MMXEXT -static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, - int dstWidth, const uint8_t *src, - int srcW, int xInc) -{ - int32_t *filterPos = c->hLumFilterPos; - int16_t *filter = c->hLumFilter; - void *mmxextFilterCode = c->lumMmxextFilterCode; - int i; -#if defined(PIC) - uint64_t ebxsave; -#endif -#if ARCH_X86_64 - uint64_t retsave; -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %5 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %6 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %5 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, %%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - -#if ARCH_X86_64 -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ - "add %%"REG_S", %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#else -#define CALL_MMXEXT_FILTER_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#endif /* ARCH_X86_64 */ - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %5, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %6, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %5, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) - dst[i] = src[srcW-1]*128; -} - -static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, - int dstWidth, const uint8_t *src1, - const uint8_t *src2, int srcW, int xInc) -{ - int32_t *filterPos = c->hChrFilterPos; - int16_t *filter = c->hChrFilter; - void *mmxextFilterCode = c->chrMmxextFilterCode; - int i; -#if defined(PIC) - DECLARE_ALIGNED(8, uint64_t, ebxsave); -#endif -#if ARCH_X86_64 - DECLARE_ALIGNED(8, uint64_t, retsave); -#endif - - __asm__ volatile( -#if defined(PIC) - "mov %%"REG_b", %7 \n\t" -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %8 \n\t" -#endif -#else -#if ARCH_X86_64 - "mov -8(%%rsp), %%"REG_a" \n\t" - "mov %%"REG_a", %7 \n\t" -#endif -#endif - "pxor %%mm7, %%mm7 \n\t" - "mov %0, %%"REG_c" \n\t" - "mov %1, %%"REG_D" \n\t" - "mov %2, %%"REG_d" \n\t" - "mov %3, %%"REG_b" \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" // i - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - "xor %%"REG_a", %%"REG_a" \n\t" // i - "mov %5, %%"REG_c" \n\t" // src - "mov %6, %%"REG_D" \n\t" // buf2 - PREFETCH" (%%"REG_c") \n\t" - PREFETCH" 32(%%"REG_c") \n\t" - PREFETCH" 64(%%"REG_c") \n\t" - - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - CALL_MMXEXT_FILTER_CODE - -#if defined(PIC) - "mov %7, %%"REG_b" \n\t" -#if ARCH_X86_64 - "mov %8, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#else -#if ARCH_X86_64 - "mov %7, %%"REG_a" \n\t" - "mov %%"REG_a", -8(%%rsp) \n\t" -#endif -#endif - :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), - "m" (mmxextFilterCode), "m" (src2), "m"(dst2) -#if defined(PIC) - ,"m" (ebxsave) -#endif -#if ARCH_X86_64 - ,"m"(retsave) -#endif - : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D -#if !defined(PIC) - ,"%"REG_b -#endif - ); - - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { - dst1[i] = src1[srcW-1]*128; - dst2[i] = src2[srcW-1]*128; - } -} -#endif /* COMPILE_TEMPLATE_MMXEXT */ - static av_cold void RENAME(sws_init_swscale)(SwsContext *c) { enum AVPixelFormat dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + c->use_mmx_vfilter= 0; + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 + && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; @@ -1579,10 +1515,15 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } else { + c->use_mmx_vfilter= 1; + c->yuv2planeX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; + case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break; +#if HAVE_6REGS case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; +#endif case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; @@ -1590,7 +1531,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case AV_PIX_FMT_RGB32: @@ -1619,12 +1559,12 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c) } } - if (c->srcBpc == 8 && c->dstBpc <= 10) { + if (c->srcBpc == 8 && c->dstBpc <= 14) { // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). #if COMPILE_TEMPLATE_MMXEXT if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { - c->hyscale_fast = RENAME(hyscale_fast); - c->hcscale_fast = RENAME(hcscale_fast); + c->hyscale_fast = ff_hyscale_fast_mmxext; + c->hcscale_fast = ff_hcscale_fast_mmxext; } else { #endif /* COMPILE_TEMPLATE_MMXEXT */ c->hyscale_fast = NULL; diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c index dd9a2a4378..88143d9687 100644 --- a/libswscale/x86/w64xmmtest.c +++ b/libswscale/x86/w64xmmtest.c @@ -2,20 +2,20 @@ * check XMM registers for clobbers on Win64 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c index bacc87f6c7..5e2f77c20f 100644 --- a/libswscale/x86/yuv2rgb.c +++ b/libswscale/x86/yuv2rgb.c @@ -7,27 +7,26 @@ * 1,4,8bpp support and context / deglobalize stuff * by Michael Niedermayer (michaelni@gmx.at) * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include <stdio.h> #include <stdlib.h> #include <inttypes.h> -#include <assert.h> #include "config.h" #include "libswscale/rgb2rgb.h" @@ -51,34 +50,30 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL; DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL; //MMX versions -#if HAVE_MMX_INLINE +#if HAVE_MMX_INLINE && HAVE_6REGS #undef RENAME #undef COMPILE_TEMPLATE_MMXEXT #define COMPILE_TEMPLATE_MMXEXT 0 #define RENAME(a) a ## _mmx #include "yuv2rgb_template.c" -#endif /* HAVE_MMX_INLINE */ +#endif /* HAVE_MMX_INLINE && HAVE_6REGS */ // MMXEXT versions -#if HAVE_MMXEXT_INLINE +#if HAVE_MMXEXT_INLINE && HAVE_6REGS #undef RENAME #undef COMPILE_TEMPLATE_MMXEXT #define COMPILE_TEMPLATE_MMXEXT 1 #define RENAME(a) a ## _mmxext #include "yuv2rgb_template.c" -#endif /* HAVE_MMXEXT_INLINE */ +#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */ #endif /* HAVE_INLINE_ASM */ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) { -#if HAVE_MMX_INLINE +#if HAVE_MMX_INLINE && HAVE_6REGS int cpu_flags = av_get_cpu_flags(); - if (c->srcFormat != AV_PIX_FMT_YUV420P && - c->srcFormat != AV_PIX_FMT_YUVA420P) - return NULL; - #if HAVE_MMXEXT_INLINE if (INLINE_MMXEXT(cpu_flags)) { switch (c->dstFormat) { @@ -118,7 +113,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c) return yuv420_rgb15_mmx; } } -#endif /* HAVE_MMX_INLINE */ +#endif /* HAVE_MMX_INLINE && HAVE_6REGS */ return NULL; } diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c index 0b9751623e..acb78f520e 100644 --- a/libswscale/x86/yuv2rgb_template.c +++ b/libswscale/x86/yuv2rgb_template.c @@ -4,20 +4,20 @@ * Copyright (C) 2001-2007 Michael Niedermayer * (c) 2010 Konstantin Shishkov * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -48,17 +48,14 @@ if (h_size * depth > FFABS(dstStride[0])) \ h_size -= 8; \ \ - if (c->srcFormat == AV_PIX_FMT_YUV422P) { \ - srcStride[1] *= 2; \ - srcStride[2] *= 2; \ - } \ + vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \ \ __asm__ volatile ("pxor %mm4, %mm4\n\t"); \ for (y = 0; y < srcSliceH; y++) { \ uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \ const uint8_t *py = src[0] + y * srcStride[0]; \ - const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \ - const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \ + const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \ + const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \ x86_reg index = -h_size / 2; \ #define YUV2RGB_INITIAL_LOAD \ @@ -142,10 +139,21 @@ "add $4, %0\n\t" \ "js 1b\n\t" \ +#if COMPILE_TEMPLATE_MMXEXT +#undef RGB_PACK24_B_OPERANDS +#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001) +#else +#undef RGB_PACK24_B_OPERANDS +#define RGB_PACK24_B_OPERANDS +#endif + #define YUV2RGB_OPERANDS \ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index) \ + NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \ + RGB_PACK24_B_OPERANDS \ + : "memory" \ ); \ } \ @@ -153,6 +161,8 @@ : "+r" (index), "+r" (image) \ : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \ "r" (py - 2*index), "r" (pa - 2*index) \ + NAMED_CONSTRAINTS_ADD(mmx_00ffw) \ + : "memory" \ ); \ } \ @@ -193,7 +203,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -221,7 +231,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(2) @@ -311,7 +321,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -329,7 +339,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(3) @@ -373,7 +383,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -394,7 +404,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -416,7 +426,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) @@ -437,7 +447,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[], int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]) { - int y, h_size; + int y, h_size, vshift; YUV2RGB_LOOP(4) |