Diffstat (limited to 'libswscale/x86')
-rw-r--r--  libswscale/x86/Makefile                     |   4
-rw-r--r--  libswscale/x86/hscale_fast_bilinear_simd.c  | 359
-rw-r--r--  libswscale/x86/input.asm                    | 252
-rw-r--r--  libswscale/x86/output.asm                   | 172
-rw-r--r--  libswscale/x86/rgb2rgb.c                    |  17
-rw-r--r--  libswscale/x86/rgb2rgb_template.c           | 240
-rw-r--r--  libswscale/x86/scale.asm                    |  14
-rw-r--r--  libswscale/x86/swscale.c                    | 176
-rw-r--r--  libswscale/x86/swscale_template.c           | 460
-rw-r--r--  libswscale/x86/w64xmmtest.c                 |   8
-rw-r--r--  libswscale/x86/yuv2rgb.c                    |  24
-rw-r--r--  libswscale/x86/yuv2rgb_template.c           |  46
12 files changed, 1168 insertions, 604 deletions
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index b94b14abbb..69012077bb 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,7 +1,11 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
+MMX-OBJS += x86/hscale_fast_bilinear_simd.o \
+
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
YASM-OBJS += x86/input.o \
diff --git a/libswscale/x86/hscale_fast_bilinear_simd.c b/libswscale/x86/hscale_fast_bilinear_simd.c
new file mode 100644
index 0000000000..b37b63c3ec
--- /dev/null
+++ b/libswscale/x86/hscale_fast_bilinear_simd.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../swscale_internal.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+
+#define RET 0xC3 // near return opcode for x86
+#define PREFETCH "prefetchnta"
+
+#if HAVE_INLINE_ASM
+av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
+ int16_t *filter, int32_t *filterPos,
+ int numSplits)
+{
+ uint8_t *fragmentA;
+ x86_reg imm8OfPShufW1A;
+ x86_reg imm8OfPShufW2A;
+ x86_reg fragmentLengthA;
+ uint8_t *fragmentB;
+ x86_reg imm8OfPShufW1B;
+ x86_reg imm8OfPShufW2B;
+ x86_reg fragmentLengthB;
+ int fragmentPos;
+
+ int xpos, i;
+
+ // create an optimized horizontal scaling routine
+ /* This scaler is made of runtime-generated MMXEXT code using specially tuned
+ * pshufw instructions. For every four output pixels, if four input pixels
+ * are enough for the fast bilinear scaling, then a chunk of fragmentB is
+ * used. If five input pixels are needed, then a chunk of fragmentA is used.
+ */
+
+ // code fragment
+
+ __asm__ volatile (
+ "jmp 9f \n\t"
+ // Begin
+ "0: \n\t"
+ "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
+ "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
+ "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "pshufw $0xFF, %%mm1, %%mm1 \n\t"
+ "1: \n\t"
+ "pshufw $0xFF, %%mm0, %%mm0 \n\t"
+ "2: \n\t"
+ "psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "psllw $7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+
+ "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ // End
+ "9: \n\t"
+ "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
+ "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
+ "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
+ "dec %1 \n\t"
+ "dec %2 \n\t"
+ "sub %0, %1 \n\t"
+ "sub %0, %2 \n\t"
+ "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
+ "sub %0, %3 \n\t"
+
+
+ : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
+ "=r" (fragmentLengthA)
+ );
+
+ __asm__ volatile (
+ "jmp 9f \n\t"
+ // Begin
+ "0: \n\t"
+ "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
+ "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "pshufw $0xFF, %%mm0, %%mm1 \n\t"
+ "1: \n\t"
+ "pshufw $0xFF, %%mm0, %%mm0 \n\t"
+ "2: \n\t"
+ "psubw %%mm1, %%mm0 \n\t"
+ "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "psllw $7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+
+ "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
+
+ "add $8, %%"REG_a" \n\t"
+ // End
+ "9: \n\t"
+ "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
+ "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
+ "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
+ "dec %1 \n\t"
+ "dec %2 \n\t"
+ "sub %0, %1 \n\t"
+ "sub %0, %2 \n\t"
+ "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
+ "sub %0, %3 \n\t"
+
+
+ : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
+ "=r" (fragmentLengthB)
+ );
+
+ xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
+ fragmentPos = 0;
+
+ for (i = 0; i < dstW / numSplits; i++) {
+ int xx = xpos >> 16;
+
+ if ((i & 3) == 0) {
+ int a = 0;
+ int b = ((xpos + xInc) >> 16) - xx;
+ int c = ((xpos + xInc * 2) >> 16) - xx;
+ int d = ((xpos + xInc * 3) >> 16) - xx;
+ int inc = (d + 1 < 4);
+ uint8_t *fragment = inc ? fragmentB : fragmentA;
+ x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
+ x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
+ x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
+ int maxShift = 3 - (d + inc);
+ int shift = 0;
+
+ if (filterCode) {
+ filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
+ filterPos[i / 2] = xx;
+
+ memcpy(filterCode + fragmentPos, fragment, fragmentLength);
+
+ filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
+ ((b + inc) << 2) |
+ ((c + inc) << 4) |
+ ((d + inc) << 6);
+ filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
+ (c << 4) |
+ (d << 6);
+
+ if (i + 4 - inc >= dstW)
+ shift = maxShift; // avoid overread
+ else if ((filterPos[i / 2] & 3) <= maxShift)
+ shift = filterPos[i / 2] & 3; // align
+
+ if (shift && i >= shift) {
+ filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
+ filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
+ filterPos[i / 2] -= shift;
+ }
+ }
+
+ fragmentPos += fragmentLength;
+
+ if (filterCode)
+ filterCode[fragmentPos] = RET;
+ }
+ xpos += xInc;
+ }
+ if (filterCode)
+ filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
+
+ return fragmentPos + 1;
+}
+
+void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
+ int dstWidth, const uint8_t *src,
+ int srcW, int xInc)
+{
+ int32_t *filterPos = c->hLumFilterPos;
+ int16_t *filter = c->hLumFilter;
+ void *mmxextFilterCode = c->lumMmxextFilterCode;
+ int i;
+#if ARCH_X86_64
+ uint64_t retsave;
+#else
+#if defined(PIC)
+ uint64_t ebxsave;
+#endif
+#endif
+
+ __asm__ volatile(
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %5 \n\t" // retsave
+#else
+#if defined(PIC)
+ "mov %%"REG_b", %5 \n\t" // ebxsave
+#endif
+#endif
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+#if ARCH_X86_64
+#define CALL_MMXEXT_FILTER_CODE \
+ "movl (%%"REG_b"), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
+ "add %%"REG_S", %%"REG_c" \n\t"\
+ "add %%"REG_a", %%"REG_D" \n\t"\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+
+#else
+#define CALL_MMXEXT_FILTER_CODE \
+ "movl (%%"REG_b"), %%esi \n\t"\
+ "call *%4 \n\t"\
+ "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
+ "add %%"REG_a", %%"REG_D" \n\t"\
+ "xor %%"REG_a", %%"REG_a" \n\t"\
+
+#endif /* ARCH_X86_64 */
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+
+#if ARCH_X86_64
+ "mov %5, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#else
+#if defined(PIC)
+ "mov %5, %%"REG_b" \n\t"
+#endif
+#endif
+ :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
+ "m" (mmxextFilterCode)
+#if ARCH_X86_64
+ ,"m"(retsave)
+#else
+#if defined(PIC)
+ ,"m" (ebxsave)
+#endif
+#endif
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if ARCH_X86_64 || !defined(PIC)
+ ,"%"REG_b
+#endif
+ );
+
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
+
+void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc)
+{
+ int32_t *filterPos = c->hChrFilterPos;
+ int16_t *filter = c->hChrFilter;
+ void *mmxextFilterCode = c->chrMmxextFilterCode;
+ int i;
+#if ARCH_X86_64
+ DECLARE_ALIGNED(8, uint64_t, retsave);
+#else
+#if defined(PIC)
+ DECLARE_ALIGNED(8, uint64_t, ebxsave);
+#endif
+#endif
+ __asm__ volatile(
+#if ARCH_X86_64
+ "mov -8(%%rsp), %%"REG_a" \n\t"
+ "mov %%"REG_a", %7 \n\t" // retsave
+#else
+#if defined(PIC)
+ "mov %%"REG_b", %7 \n\t" // ebxsave
+#endif
+#endif
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ "mov %5, %%"REG_c" \n\t" // src2
+ "mov %6, %%"REG_D" \n\t" // dst2
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+ CALL_MMXEXT_FILTER_CODE
+
+#if ARCH_X86_64
+ "mov %7, %%"REG_a" \n\t"
+ "mov %%"REG_a", -8(%%rsp) \n\t"
+#else
+#if defined(PIC)
+ "mov %7, %%"REG_b" \n\t"
+#endif
+#endif
+ :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
+ "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
+#if ARCH_X86_64
+ ,"m"(retsave)
+#else
+#if defined(PIC)
+ ,"m" (ebxsave)
+#endif
+#endif
+ : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+#if ARCH_X86_64 || !defined(PIC)
+ ,"%"REG_b
+#endif
+ );
+
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+#endif //HAVE_INLINE_ASM
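The generator above emits two template fragments once at init time (fragmentA for the case where five input pixels are needed, fragmentB for the four-pixel case), records their lengths and the byte offsets of their two pshufw immediates, then builds the scaler by copying a fragment per group of four output pixels, patching the shuffle immediates and keeping a RET opcode at the end of the buffer. A minimal C sketch of that copy-and-patch step, assuming nothing beyond what the function above does (emit_fragment and its parameter names are illustrative, not symbols from this patch):

    /* Copy one template fragment into the generated-code buffer and patch
     * the imm8 bytes of its two pshufw instructions; keep the buffer
     * terminated with a near-return so it stays callable. */
    #include <stdint.h>
    #include <string.h>

    #define RET_OPCODE 0xC3

    static int emit_fragment(uint8_t *code, int pos,
                             const uint8_t *fragment, size_t fragLen,
                             size_t imm1Off, size_t imm2Off,
                             uint8_t shuf1, uint8_t shuf2)
    {
        memcpy(code + pos, fragment, fragLen);  /* copy the template body */
        code[pos + imm1Off] = shuf1;            /* patch 1st pshufw imm8  */
        code[pos + imm2Off] = shuf2;            /* patch 2nd pshufw imm8  */
        pos += fragLen;
        code[pos] = RET_OPCODE;                 /* overwritten by the next fragment */
        return pos;
    }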
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 6f5677e1fd..af9afcaa53 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -4,20 +4,20 @@
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -35,33 +35,59 @@ SECTION_RODATA
%define GV 0xD0E3
%define BV 0xF6E4
-rgb_Yrnd: times 4 dd 0x84000 ; 16.5 << 15
-rgb_UVrnd: times 4 dd 0x404000 ; 128.5 << 15
-bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
-bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
-rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
-rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
-bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
-bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
-rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
-rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
-bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
-bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
-rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
-rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
-
-rgba_Ycoeff_rb: times 4 dw RY, BY
-rgba_Ycoeff_br: times 4 dw BY, RY
-rgba_Ycoeff_ga: times 4 dw GY, 0
-rgba_Ycoeff_ag: times 4 dw 0, GY
-rgba_Ucoeff_rb: times 4 dw RU, BU
-rgba_Ucoeff_br: times 4 dw BU, RU
-rgba_Ucoeff_ga: times 4 dw GU, 0
-rgba_Ucoeff_ag: times 4 dw 0, GU
-rgba_Vcoeff_rb: times 4 dw RV, BV
-rgba_Vcoeff_br: times 4 dw BV, RV
-rgba_Vcoeff_ga: times 4 dw GV, 0
-rgba_Vcoeff_ag: times 4 dw 0, GV
+rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
+rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
+%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
+%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
+%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
+%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
+%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
+%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
+%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
+%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
+%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
+%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
+%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
+%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
+
+%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
+%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
+%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
+%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
+%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
+%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
+%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
+%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
+%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
+%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
+%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
+%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
+
+; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
+; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
+; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
+; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
+; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
+; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
+; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
+; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
+; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
+; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
+; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
+; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
+
+; rgba_Ycoeff_rb: times 4 dw RY, BY
+; rgba_Ycoeff_br: times 4 dw BY, RY
+; rgba_Ycoeff_ga: times 4 dw GY, 0
+; rgba_Ycoeff_ag: times 4 dw 0, GY
+; rgba_Ucoeff_rb: times 4 dw RU, BU
+; rgba_Ucoeff_br: times 4 dw BU, RU
+; rgba_Ucoeff_ga: times 4 dw GU, 0
+; rgba_Ucoeff_ag: times 4 dw 0, GU
+; rgba_Vcoeff_rb: times 4 dw RV, BV
+; rgba_Vcoeff_br: times 4 dw BV, RV
+; rgba_Vcoeff_ga: times 4 dw GV, 0
+; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
@@ -82,7 +108,7 @@ SECTION .text
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
@@ -114,6 +140,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
+ add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
@@ -157,12 +184,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -171,7 +197,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
@@ -202,10 +228,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
@@ -263,23 +290,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
- psrad m0, 15
- psrad m2, 15
- psrad m1, 15
- psrad m4, 15
+ psrad m0, 9
+ psrad m2, 9
+ psrad m1, 9
+ psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-3] }
- packuswb m2, m2 ; (byte) { V[0-3] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
@@ -305,13 +329,15 @@ RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
+%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
@@ -321,7 +347,9 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
- lea srcq, [srcq+wq*4]
+ add wq, wq
+ sub wq, mmsize - 1
+ lea srcq, [srcq+wq*2]
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
@@ -329,8 +357,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
@@ -340,13 +368,29 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
- psrad m0, 15
- psrad m2, 15
+ psrad m0, 9
+ psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
- packuswb m0, m0 ; (byte) { Y[0-7] }
- movh [dstq+wq], m0
- add wq, mmsize / 2
+ mova [dstq+wq], m0
+ add wq, mmsize
jl .loop
+ sub wq, mmsize - 1
+ jz .end
+ add srcq, 2*mmsize - 2
+ add dstq, mmsize - 1
+.loop2:
+ movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
+ pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
+ pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
+ paddd m0, m4 ; += rgb_Yrnd
+ paddd m0, m1 ; (dword) { Y[0-3] }
+ psrad m0, 9
+ packssdw m0, m0 ; (word) { Y[0-7] }
+ movd [dstq+wq], m0
+ add wq, 2
+ jl .loop2
+.end:
REP_RET
%endif ; %0 == 3
%endmacro
@@ -354,7 +398,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
@@ -375,21 +419,23 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
+ add wq, wq
+ sub wq, mmsize - 1
add dstUq, wq
add dstVq, wq
- lea srcq, [srcq+wq*4]
+ lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
- movu m0, [srcq+wq*4+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
- movu m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+ movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
@@ -405,26 +451,48 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
- psrad m0, 15
+ psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
- psrad m2, 15
- psrad m4, 15
- psrad m1, 15
+ psrad m2, 9
+ psrad m4, 9
+ psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
- packuswb m0, m0 ; (byte) { U[0-7] }
- packuswb m2, m2 ; (byte) { V[0-7] }
- movh [dstUq+wq], m0
- movh [dstVq+wq], m2
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%else ; mmsize == 16
- packuswb m0, m2 ; (byte) { U[0-7], V[0-7] }
- movh [dstUq+wq], m0
- movhps [dstVq+wq], m0
+ mova [dstUq+wq], m0
+ mova [dstVq+wq], m2
%endif ; mmsize == 8/16
- add wq, mmsize / 2
+ add wq, mmsize
jl .loop
+ sub wq, mmsize - 1
+ jz .end
+ add srcq , 2*mmsize - 2
+ add dstUq, mmsize - 1
+ add dstVq, mmsize - 1
+.loop2:
+ movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3]
+ DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
+ pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
+ pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
+ pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
+ pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
+ paddd m3, m6 ; += rgb_UVrnd
+ paddd m1, m6 ; += rgb_UVrnd
+ paddd m2, m3 ; (dword) { V[0-3] }
+ paddd m0, m1 ; (dword) { U[0-3] }
+ psrad m0, 9
+ psrad m2, 9
+ packssdw m0, m0 ; (word) { U[0-7] }
+ packssdw m2, m2 ; (word) { V[0-7] }
+ movd [dstUq+wq], m0
+ movd [dstVq+wq], m2
+ add wq, 2
+ jl .loop2
+.end:
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
@@ -451,8 +519,10 @@ RGB32_FUNCS 0, 0
INIT_XMM sse2
RGB32_FUNCS 8, 12
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
+%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -489,7 +559,7 @@ RGB32_FUNCS 8, 12
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
@@ -559,11 +629,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -593,8 +663,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
- pand m2, m0, m4 ; (word) { U0, U1, ..., U7 }
- pand m3, m1, m4 ; (word) { U8, U9, ..., U15 }
+ pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+ pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
@@ -614,11 +684,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
- movsxd wq, dword r4m
+ movsxd wq, dword r5m
%else ; x86-32
- mov wq, r4m
+ mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
@@ -626,8 +696,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
- pcmpeqb m4, m4 ; (byte) { 0xff } x 16
- psrlw m4, 8 ; (word) { 0x00ff } x 8
+ pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+ psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
@@ -659,6 +729,7 @@ YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
@@ -666,3 +737,4 @@ YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
+%endif
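The psrad 15 → 9 changes above switch the RGB→Y and RGB→UV input routines from packed 8-bit output to 16-bit intermediate words (six extra fractional bits), which is also why the byte packs disappear, the stores become full-width mova, and wq is doubled. The new rounding constants follow the same convention: 0x80100 is (16 << 15) + (1 << 8), the +16 luma offset in the coefficients' 15-bit scale plus half an LSB of the 9-bit shift, and 0x400100 is the analogous +128 chroma offset. A scalar sketch of the per-pixel luma step, assuming RY/GY/BY are the 15-bit fixed-point coefficients defined at the top of this file (rgb_to_y_word is an illustrative name):

    #include <stdint.h>

    /* Higher-precision luma: multiply-accumulate with 15-bit coefficients,
     * add the rounded +16 offset, keep 6 fractional bits by shifting by 9. */
    static inline int16_t rgb_to_y_word(int r, int g, int b,
                                        int ry, int gy, int by)
    {
        const int rnd = (16 << 15) + (1 << 8);              /* 0x80100 */
        return (int16_t)((ry * r + gy * g + by * b + rnd) >> 9);
    }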
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 71cd72edf4..db3e9934f8 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -54,76 +54,8 @@ SECTION .text
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
-
-%macro yuv2planeX_fn 3
-
-%if ARCH_X86_32
-%define cntr_reg fltsizeq
-%define movsx mov
-%else
-%define cntr_reg r7
-%define movsx movsxd
-%endif
-
-cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
-%if %1 == 8 || %1 == 9 || %1 == 10
- pxor m6, m6
-%endif ; %1 == 8/9/10
-
-%if %1 == 8
-%if ARCH_X86_32
-%assign pad 0x2c - (stack_offset & 15)
- SUB rsp, pad
-%define m_dith m7
-%else ; x86-64
-%define m_dith m9
-%endif ; x86-32
-
- ; create registers holding dither
- movq m_dith, [ditherq] ; dither
- test offsetd, offsetd
- jz .no_rot
-%if mmsize == 16
- punpcklqdq m_dith, m_dith
-%endif ; mmsize == 16
- PALIGNR m_dith, m_dith, 3, m0
-.no_rot:
-%if mmsize == 16
- punpcklbw m_dith, m6
-%if ARCH_X86_64
- punpcklwd m8, m_dith, m6
- pslld m8, 12
-%else ; x86-32
- punpcklwd m5, m_dith, m6
- pslld m5, 12
-%endif ; x86-32/64
- punpckhwd m_dith, m6
- pslld m_dith, 12
-%if ARCH_X86_32
- mova [rsp+ 0], m5
- mova [rsp+16], m_dith
-%endif
-%else ; mmsize == 8
- punpcklbw m5, m_dith, m6
- punpckhbw m_dith, m6
- punpcklwd m4, m5, m6
- punpckhwd m5, m6
- punpcklwd m3, m_dith, m6
- punpckhwd m_dith, m6
- pslld m4, 12
- pslld m5, 12
- pslld m3, 12
- pslld m_dith, 12
- mova [rsp+ 0], m4
- mova [rsp+ 8], m5
- mova [rsp+16], m3
- mova [rsp+24], m_dith
-%endif ; mmsize == 8/16
-%endif ; %1 == 8
-
- xor r5, r5
-
-.pixelloop:
+%macro yuv2planeX_mainloop 2
+.pixelloop_%2:
%assign %%i 0
; the rep here is for the 8-bit output MMX case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
@@ -150,7 +82,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
-.filterloop_ %+ %%i:
+.filterloop_%2_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
@@ -197,7 +129,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
- jg .filterloop_ %+ %%i
+ jg .filterloop_%2_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
@@ -224,7 +156,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
- mova [dstq+r5*2], m2
+ mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
@@ -232,7 +164,87 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%assign %%i %%i+2
%endrep
- jg .pixelloop
+ jg .pixelloop_%2
+%endmacro
+
+%macro yuv2planeX_fn 3
+
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+ pxor m6, m6
+%endif ; %1 == 8/9/10
+
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+ SUB rsp, pad
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+ ; create registers holding dither
+ movq m_dith, [ditherq] ; dither
+ test offsetd, offsetd
+ jz .no_rot
+%if mmsize == 16
+ punpcklqdq m_dith, m_dith
+%endif ; mmsize == 16
+ PALIGNR m_dith, m_dith, 3, m0
+.no_rot:
+%if mmsize == 16
+ punpcklbw m_dith, m6
+%if ARCH_X86_64
+ punpcklwd m8, m_dith, m6
+ pslld m8, 12
+%else ; x86-32
+ punpcklwd m5, m_dith, m6
+ pslld m5, 12
+%endif ; x86-32/64
+ punpckhwd m_dith, m6
+ pslld m_dith, 12
+%if ARCH_X86_32
+ mova [rsp+ 0], m5
+ mova [rsp+16], m_dith
+%endif
+%else ; mmsize == 8
+ punpcklbw m5, m_dith, m6
+ punpckhbw m_dith, m6
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m3, m_dith, m6
+ punpckhwd m_dith, m6
+ pslld m4, 12
+ pslld m5, 12
+ pslld m3, 12
+ pslld m_dith, 12
+ mova [rsp+ 0], m4
+ mova [rsp+ 8], m5
+ mova [rsp+16], m3
+ mova [rsp+24], m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+ xor r5, r5
+
+%if mmsize == 8 || %1 == 8
+ yuv2planeX_mainloop %1, a
+%else ; mmsize == 16
+ test dstq, 15
+ jnz .unaligned
+ yuv2planeX_mainloop %1, a
+ REP_RET
+.unaligned:
+ yuv2planeX_mainloop %1, u
+%endif ; mmsize == 8/16
%if %1 == 8
%if ARCH_X86_32
@@ -264,10 +276,12 @@ yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
+%endif
; %1=outout-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
@@ -402,8 +416,10 @@ yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
+%endif
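The yuv2planeX refactoring above hoists the pixel loop into the yuv2planeX_mainloop macro so it can be instantiated twice, once with aligned (mova) and once with unaligned (movu) stores, chosen at run time by testing the low four bits of the destination pointer. The dispatch reads like this in C (loop names are hypothetical):

    #include <stdint.h>

    void yuv2planeX_loop_a(uint8_t *dst, int w);   /* aligned stores (mova)   */
    void yuv2planeX_loop_u(uint8_t *dst, int w);   /* unaligned stores (movu) */

    /* Mirror of the "test dstq, 15" dispatch above. */
    static void yuv2planeX_dispatch(uint8_t *dst, int w)
    {
        if (((uintptr_t)dst & 15) == 0)
            yuv2planeX_loop_a(dst, w);
        else
            yuv2planeX_loop_u(dst, w);
    }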
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 9cfe831e3c..b80e869e0c 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -72,8 +72,14 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
+
+DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
+DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
+DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
-#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
@@ -125,6 +131,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index a67a8a6029..95d4f8fd8a 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -131,14 +131,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr
"movq %%mm4, %%mm3 \n\t" \
"psllq $48, %%mm2 \n\t" \
"psllq $32, %%mm3 \n\t" \
- "pand "MANGLE(mask24hh)", %%mm2\n\t" \
- "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
"por %%mm2, %%mm0 \n\t" \
"psrlq $16, %%mm1 \n\t" \
"psrlq $32, %%mm4 \n\t" \
"psllq $16, %%mm5 \n\t" \
"por %%mm3, %%mm1 \n\t" \
- "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
"por %%mm5, %%mm4 \n\t" \
\
MOVNTQ" %%mm0, (%0) \n\t" \
@@ -168,6 +165,7 @@ static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"movq %%mm5, %%mm7 \n\t"
STORE_BGR24_MMX
:: "r"(dest), "r"(s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
dest += 24;
s += 32;
@@ -717,27 +715,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
}
}
-/*
- I use less accurate approximation here by simply left-shifting the input
- value and filling the low order bits with zeroes. This method improves PNG
- compression but this scheme cannot reproduce white exactly, since it does
- not generate an all-ones maximum value; the net effect is to darken the
- image slightly.
-
- The better method should be "left bit replication":
-
- 4 3 2 1 0
- ---------
- 1 1 0 1 1
-
- 7 6 5 4 3 2 1 0
- ----------------
- 1 1 0 1 1 1 1 0
- |=======| |===|
- | leftmost bits repeated to fill open bits
- |
- original bits
-*/
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
@@ -756,9 +733,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -786,9 +764,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -809,6 +788,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
:"=m"(*d)
:"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+ NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
:"memory");
/* borrowed 32 to 24 */
__asm__ volatile(
@@ -825,6 +805,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
STORE_BGR24_MMX
:: "r"(d), "m"(*s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
@@ -834,9 +815,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
@@ -858,9 +839,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -888,9 +871,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
@@ -910,6 +895,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"por %%mm5, %%mm3 \n\t"
:"=m"(*d)
:"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+ NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
:"memory");
/* borrowed 32 to 24 */
__asm__ volatile(
@@ -926,6 +912,7 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
STORE_BGR24_MMX
:: "r"(d), "m"(*s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
@@ -935,9 +922,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
@@ -980,11 +967,13 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $2, %%mm1 \n\t"
- "psrlq $7, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw %5, %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+ ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
+ NAMED_CONSTRAINTS_ADD(mul15_hi)
:"memory");
d += 16;
s += 4;
@@ -994,9 +983,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x3E0)>>2;
- *d++ = (bgr&0x7C00)>>7;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+ *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
}
}
@@ -1021,11 +1010,14 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
- "psllq $3, %%mm0 \n\t"
- "psrlq $3, %%mm1 \n\t"
- "psrlq $8, %%mm2 \n\t"
+ "psllq $5, %%mm0 \n\t"
+ "psrlq $1, %%mm2 \n\t"
+ "pmulhw %5, %%mm0 \n\t"
+ "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
+ "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
- ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+ ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
+ NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
:"memory");
d += 16;
s += 4;
@@ -1035,9 +1027,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
while (s < end) {
register uint16_t bgr;
bgr = *s++;
- *d++ = (bgr&0x1F)<<3;
- *d++ = (bgr&0x7E0)>>3;
- *d++ = (bgr&0xF800)>>8;
+ *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+ *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+ *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
}
}
@@ -1098,7 +1090,7 @@ static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst,
: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
: "memory");
for (; idx<15; idx+=4) {
- register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
+ register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
}
@@ -1150,6 +1142,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int sr
"2: \n\t"
: "+a" (mmx_size)
: "r" (src-mmx_size), "r"(dst-mmx_size)
+ NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
);
__asm__ volatile(SFENCE:::"memory");
@@ -1441,7 +1434,9 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid
dst+= dstStride;
for (y=1; y<srcHeight; y++) {
- const x86_reg mmxSize= srcWidth&~15;
+ x86_reg mmxSize= srcWidth&~15;
+
+ if (mmxSize) {
__asm__ volatile(
"mov %4, %%"REG_a" \n\t"
"movq "MANGLE(mmx_ff)", %%mm0 \n\t"
@@ -1485,8 +1480,14 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
"g" (-mmxSize)
+ NAMED_CONSTRAINTS_ADD(mmx_ff)
: "%"REG_a
);
+ } else {
+ mmxSize = 1;
+ dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
+ dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
+ }
for (x=mmxSize-1; x<srcWidth-1; x++) {
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
@@ -1629,18 +1630,33 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
+#if HAVE_7REGS
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
- int lumStride, int chromStride, int srcStride)
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv)
{
+#define BGR2Y_IDX "16*4+16*32"
+#define BGR2U_IDX "16*4+16*33"
+#define BGR2V_IDX "16*4+16*34"
int y;
const x86_reg chromWidth= width>>1;
+
+ if (height > 2) {
+ ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
+ src += 2*srcStride;
+ ydst += 2*lumStride;
+ udst += chromStride;
+ vdst += chromStride;
+ height -= 2;
+ }
+
for (y=0; y<height-2; y+=2) {
int i;
for (i=0; i<2; i++) {
__asm__ volatile(
"mov %2, %%"REG_a" \n\t"
- "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
+ "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
@@ -1659,12 +1675,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
@@ -1684,12 +1698,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
@@ -1704,7 +1716,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
- : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
+ : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
+ NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
: "%"REG_a, "%"REG_d
);
ydst += lumStride;
@@ -1714,7 +1727,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
__asm__ volatile(
"mov %4, %%"REG_a" \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
+ "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
"add %%"REG_d", %%"REG_d" \n\t"
@@ -1763,19 +1776,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
@@ -1825,19 +1836,17 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
+ "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
-#ifndef FAST_BGR2YV12
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
-#endif
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
@@ -1856,7 +1865,8 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
"movd %%mm0, (%3, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
" js 1b \n\t"
- : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
+ : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
+ NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
: "%"REG_a, "%"REG_d
);
@@ -1869,8 +1879,9 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
SFENCE" \n\t"
:::"memory");
- rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
+ ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
+#endif /* HAVE_7REGS */
#endif /* !COMPILE_TEMPLATE_SSE2 */
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
@@ -1883,7 +1894,9 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui
for (h=0; h < height; h++) {
int w;
+ if (width >= 16) {
#if COMPILE_TEMPLATE_SSE2
+ if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
__asm__(
"xor %%"REG_a", %%"REG_a" \n\t"
"1: \n\t"
@@ -1900,9 +1913,10 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui
"cmp %3, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
- : "memory", "%"REG_a""
+ : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a
);
-#else
+ } else
+#endif
__asm__(
"xor %%"REG_a", %%"REG_a" \n\t"
"1: \n\t"
@@ -1928,7 +1942,8 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
: "memory", "%"REG_a
);
-#endif
+
+ }
for (w= (width&(~15)); w < width; w++) {
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
@@ -1945,9 +1960,13 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *src, const uint8_t *unused, int w,
+ const uint8_t *unused,
+ const uint8_t *src1,
+ const uint8_t *src2,
+ int w,
uint32_t *unused2);
static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
@@ -1956,18 +1975,21 @@ static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t
int h;
for (h = 0; h < height; h++) {
- RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
+ RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
}
__asm__(
+#if !COMPILE_TEMPLATE_SSE2
EMMS" \n\t"
+#endif
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
@@ -2187,6 +2209,44 @@ static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count
}
}
+static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
+{
+ src ++;
+ dst += count;
+ src += 2*count;
+ count= - count;
+
+ if(count < -16) {
+ count += 16;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -32(%1, %0, 2), %%mm0 \n\t"
+ "movq -24(%1, %0, 2), %%mm1 \n\t"
+ "movq -16(%1, %0, 2), %%mm2 \n\t"
+ "movq -8(%1, %0, 2), %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ MOVNTQ" %%mm0,-16(%2, %0) \n\t"
+ MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src), "r"(dst)
+ );
+ count -= 16;
+ }
+ while(count<0) {
+ dst[count]= src[2*count];
+ count++;
+ }
+}
+
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
@@ -2450,7 +2510,7 @@ static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
+ RENAME(extract_odd)(src, ydst, width);
if(y&1) {
RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
udst+= chromStride;
@@ -2476,7 +2536,7 @@ static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, co
const int chromWidth = AV_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
+ RENAME(extract_odd)(src, ydst, width);
RENAME(extract_even2)(src, udst, vdst, chromWidth);
src += srcStride;
@@ -2529,7 +2589,9 @@ static av_cold void RENAME(rgb2rgb_init)(void)
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
planar2x = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
- rgb24toyv12 = RENAME(rgb24toyv12);
+#if HAVE_7REGS
+ ff_rgb24toyv12 = RENAME(rgb24toyv12);
+#endif /* HAVE_7REGS */
yuyvtoyuv420 = RENAME(yuyvtoyuv420);
uyvytoyuv420 = RENAME(uyvytoyuv420);
@@ -2538,7 +2600,9 @@ static av_cold void RENAME(rgb2rgb_init)(void)
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
interleaveBytes = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
deinterleaveBytes = RENAME(deinterleaveBytes);
#endif
+#endif
}
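Among the template changes above, interleaveBytes() now takes the SSE2 path only when src1, src2 and dest are all 16-byte aligned, falling back to the MMX loop otherwise, and skips the SIMD loops entirely for widths below 16. The new alignment guard ORs the three addresses and tests the low four bits once; a stand-alone sketch of the same check:

    #include <stdint.h>

    /* True when all three pointers are 16-byte aligned, as required before
     * entering the SSE2 interleave loop above. */
    static int all_aligned16(const void *a, const void *b, const void *c)
    {
        return !(((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & 15);
    }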
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index a4c7e1c00b..f9781703a9 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -2,20 +2,20 @@
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
@@ -420,7 +424,7 @@ INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
-SCALE_FUNCS2 6, 7, 8
+SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index f310a7593f..3b370be662 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -1,20 +1,20 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,6 +23,7 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -57,19 +58,11 @@ DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
-#ifdef FAST_BGR2YV12
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL;
-#else
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
-#endif /* FAST_BGR2YV12 */
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
+
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
@@ -87,16 +80,17 @@ DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
#include "swscale_template.c"
#endif
-void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf)
{
const int dstH= c->dstH;
const int flags= c->flags;
- int16_t **lumPixBuf= c->lumPixBuf;
- int16_t **chrUPixBuf= c->chrUPixBuf;
- int16_t **alpPixBuf= c->alpPixBuf;
- const int vLumBufSize= c->vLumBufSize;
- const int vChrBufSize= c->vChrBufSize;
+
+ SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
+ SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
+ SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];
+
+ int hasAlpha = c->needAlpha;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
@@ -117,13 +111,14 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
- const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
- const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
- int i;
+ const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
+ const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
+ const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && hasAlpha) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
+ int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
- const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+ const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
+
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
@@ -134,7 +129,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
lumSrcPtr = tmpY;
if (alpSrcPtr) {
- const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+ const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
@@ -145,7 +140,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
- const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+ const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
@@ -167,7 +162,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
- if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
@@ -186,8 +181,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
- ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
- if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+ ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
+ if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
@@ -197,12 +192,90 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
- ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
+ ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
+#if HAVE_MMXEXT
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ if(((uintptr_t)dest) & 15){
+ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
+ return;
+ }
+ filterSize--;
+#define MAIN_FUNCTION \
+ "pxor %%xmm0, %%xmm0 \n\t" \
+ "punpcklbw %%xmm0, %%xmm3 \n\t" \
+ "movd %4, %%xmm1 \n\t" \
+ "punpcklwd %%xmm1, %%xmm1 \n\t" \
+ "punpckldq %%xmm1, %%xmm1 \n\t" \
+ "punpcklqdq %%xmm1, %%xmm1 \n\t" \
+ "psllw $3, %%xmm1 \n\t" \
+ "paddw %%xmm1, %%xmm3 \n\t" \
+ "psraw $4, %%xmm3 \n\t" \
+ "movdqa %%xmm3, %%xmm4 \n\t" \
+ "movdqa %%xmm3, %%xmm7 \n\t" \
+ "movl %3, %%ecx \n\t" \
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
+ "movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
+ "movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%xmm0, %%xmm2 \n\t"\
+ "pmulhw %%xmm0, %%xmm5 \n\t"\
+ "paddw %%xmm2, %%xmm3 \n\t"\
+ "paddw %%xmm5, %%xmm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%xmm3 \n\t"\
+ "psraw $3, %%xmm4 \n\t"\
+ "packuswb %%xmm4, %%xmm3 \n\t"\
+ "movntdq %%xmm3, (%1, %%"REG_c")\n\t"\
+ "add $16, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movdqa %%xmm7, %%xmm3 \n\t" \
+ "movdqa %%xmm7, %%xmm4 \n\t" \
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"
+
+ if (offset) {
+ __asm__ volatile(
+ "movq %5, %%xmm3 \n\t"
+ "movdqa %%xmm3, %%xmm4 \n\t"
+ "psrlq $24, %%xmm3 \n\t"
+ "psllq $40, %%xmm4 \n\t"
+ "por %%xmm4, %%xmm3 \n\t"
+ MAIN_FUNCTION
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+ "m"(filterSize), "m"(((uint64_t *) dither)[0])
+ : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+ "%"REG_d, "%"REG_S, "%"REG_c
+ );
+ } else {
+ __asm__ volatile(
+ "movq %5, %%xmm3 \n\t"
+ MAIN_FUNCTION
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+ "m"(filterSize), "m"(((uint64_t *) dither)[0])
+ : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+ "%"REG_d, "%"REG_S, "%"REG_c
+ );
+ }
+}
+#endif
+
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
@@ -216,10 +289,14 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
+ SCALE_FUNC(filter_n, 12, 15, opt); \
+ SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
+ SCALE_FUNC(filter_n, 12, 19, opt); \
+ SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
@@ -275,11 +352,14 @@ VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+ const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
- const uint8_t *src, const uint8_t *unused1, \
- int w, uint32_t *unused2)
+ const uint8_t *unused0, \
+ const uint8_t *src1, \
+ const uint8_t *src2, \
+ int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
@@ -313,20 +393,31 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
+ if (cpu_flags & AV_CPU_FLAG_SSE3){
+ if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+ c->yuv2planeX = yuv2yuvX_sse3;
+ }
#endif
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
- } else /* c->srcBpc == 16 */ { \
- hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+ } else if (c->srcBpc == 12) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+ } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth<16)) { \
+ hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+ ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+ } else { /* c->srcBpc == 16 */ \
+ av_assert0(c->srcBpc == 16);\
+ hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
@@ -341,14 +432,15 @@ switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
- default: if (condition_8bit) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
+ case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
- default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
+ default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
@@ -365,7 +457,7 @@ switch(c->dstBpc){ \
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_mmx;
- if (c->alpPixBuf)
+ if (c->needAlpha)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
@@ -414,7 +506,7 @@ switch(c->dstBpc){ \
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_sse2;
- if (c->alpPixBuf)
+ if (c->needAlpha)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
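
Both new vertical scalers in this patch (yuv2yuvX_sse3() above and the MMX RENAME(yuv2yuvX) added to swscale_template.c below) compute the same thing: a dither-biased, fixed-point weighted sum of the 15-bit intermediate lines, saturated to 8 bits. The following plain-C reference is an editor's sketch of the per-pixel arithmetic only; it assumes a flat coefficient array rather than the packed (source pointer, coefficient) table that ff_updateMMXDitherTables() actually prepares, ignores the dither rotation used when offset != 0, and ignores possible 16-bit overflow in the intermediate sums.

#include <stdint.h>

/* Illustrative scalar equivalent of the yuv2yuvX kernels (not FFmpeg code). */
static void yuv2yuvX_ref(const int16_t *filter, int filterSize,
                         const int16_t **src, uint8_t *dest, int dstW,
                         const uint8_t *dither)
{
    for (int x = 0; x < dstW; x++) {
        /* dither bias: (dither byte + 8*(filterSize-1)) >> 4, as in the asm prologue */
        int acc = (dither[x & 7] + 8 * (filterSize - 1)) >> 4;
        for (int j = 0; j < filterSize; j++)
            acc += (filter[j] * src[j][x]) >> 16;   /* pmulhw: high half of the product */
        acc >>= 3;                                   /* psraw $3 */
        dest[x] = acc < 0 ? 0 : (acc > 255 ? 255 : acc); /* packuswb saturation */
    }
}
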
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 1e42ec5b12..3b38e98974 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,20 +1,20 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,21 +25,101 @@
#undef REAL_MOVNTQ
#undef MOVNTQ
+#undef MOVNTQ2
#undef PREFETCH
-#if COMPILE_TEMPLATE_MMXEXT
-#define PREFETCH "prefetchnta"
-#else
-#define PREFETCH " # nop"
-#endif
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
+#if !COMPILE_TEMPLATE_MMXEXT
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "psrlq $24, %%mm3\n\t"
+ "psllq $40, %%mm4\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ :: "r"(srcDither)
+ );
+ }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset)
+{
+ dither_8to16(dither, offset);
+ filterSize--;
+ __asm__ volatile(
+ "movd %0, %%mm1\n\t"
+ "punpcklwd %%mm1, %%mm1\n\t"
+ "punpckldq %%mm1, %%mm1\n\t"
+ "psllw $3, %%mm1\n\t"
+ "paddw %%mm1, %%mm3\n\t"
+ "paddw %%mm1, %%mm4\n\t"
+ "psraw $4, %%mm3\n\t"
+ "psraw $4, %%mm4\n\t"
+ ::"m"(filterSize)
+ );
+
+ __asm__ volatile(\
+ "movq %%mm3, %%mm6\n\t"
+ "movq %%mm4, %%mm7\n\t"
+ "movl %3, %%ecx\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
+ "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
+ "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
+ "add $16, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "test %%"REG_S", %%"REG_S" \n\t"\
+ "pmulhw %%mm0, %%mm2 \n\t"\
+ "pmulhw %%mm0, %%mm5 \n\t"\
+ "paddw %%mm2, %%mm3 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%mm3 \n\t"\
+ "psraw $3, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm3 \n\t"
+ MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
+ "add $8, %%"REG_c" \n\t"\
+ "cmp %2, %%"REG_c" \n\t"\
+ "movq %%mm6, %%mm3\n\t"
+ "movq %%mm7, %%mm4\n\t"
+ "mov %0, %%"REG_d" \n\t"\
+ "mov (%%"REG_d"), %%"REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+ : "%"REG_d, "%"REG_S, "%"REG_c
+ );
+}
+
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
@@ -92,6 +172,7 @@
:: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\
"r" (dest), "m" (dstW_reg), "m"(uv_off) \
+ NAMED_CONSTRAINTS_ADD(bF8,bFC) \
: "%"REG_a, "%"REG_d, "%"REG_S \
);
@@ -252,7 +333,7 @@
MOVNTQ( q3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
@@ -265,9 +346,9 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"movq %%mm2, "U_TEMP"(%0) \n\t"
@@ -278,13 +359,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
+ WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
@@ -298,22 +379,51 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
+
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ } else {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ }
+}
+
+static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY)
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
@@ -342,7 +452,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
@@ -355,7 +465,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -366,7 +476,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
- WRITERGB16(%4, %5, %%REGa)
+ WRITERGB16(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -379,7 +489,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -390,7 +500,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
- WRITERGB16(%4, %5, %%REGa)
+ WRITERGB16(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -419,7 +529,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
@@ -432,7 +542,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -443,7 +553,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
- WRITERGB15(%4, %5, %%REGa)
+ WRITERGB15(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -456,7 +566,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -467,7 +577,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
- WRITERGB15(%4, %5, %%REGa)
+ WRITERGB15(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -521,7 +631,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMXEXT(dst, dstw, index) \
@@ -569,7 +679,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#if COMPILE_TEMPLATE_MMXEXT
@@ -580,6 +690,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
+#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
@@ -589,17 +700,18 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
- WRITEBGR24(%%REGc, %5, %%REGa)
+ WRITEBGR24(%%REGc, "%5", %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
@@ -613,20 +725,22 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
- WRITEBGR24(%%REGc, %5, %%REGa)
+ WRITEBGR24(%%REGc, "%5", %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
+#endif /* HAVE_6REGS */
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
@@ -641,7 +755,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
@@ -654,7 +768,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -662,7 +776,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%REGa)
+ WRITEYUY2(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -675,7 +789,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -683,7 +797,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%REGa)
+ WRITEYUY2(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
@@ -775,7 +889,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
__asm__ volatile(
@@ -784,15 +898,15 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
"a" (&c->redDither),
"r" (abuf0), "r" (abuf1)
: "%r8"
);
#else
- *(const uint16_t **)(&c->u_temp)=abuf0;
- *(const uint16_t **)(&c->v_temp)=abuf1;
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -808,7 +922,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
"packuswb %%mm7, %%mm1 \n\t"
"pop %1 \n\t"
"pop %0 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -822,7 +936,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -839,18 +953,18 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
@@ -862,7 +976,6 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -875,11 +988,12 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
}
@@ -891,7 +1005,6 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -904,11 +1017,12 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
@@ -960,13 +1074,12 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1102,14 +1215,14 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%REGBP)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1122,7 +1235,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1131,14 +1244,14 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
}
} else {
const int16_t *ubuf1 = ubuf[1];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%REGBP)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1151,7 +1264,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1177,11 +1290,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1191,11 +1305,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
}
@@ -1222,11 +1337,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1242,11 +1358,12 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
}
}
@@ -1273,11 +1390,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1293,11 +1411,12 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+ WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
}
@@ -1354,7 +1473,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1367,7 +1486,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1b(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1375,203 +1494,20 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
);
}
}
-
-#if COMPILE_TEMPLATE_MMXEXT
-static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
- int dstWidth, const uint8_t *src,
- int srcW, int xInc)
-{
- int32_t *filterPos = c->hLumFilterPos;
- int16_t *filter = c->hLumFilter;
- void *mmxextFilterCode = c->lumMmxextFilterCode;
- int i;
-#if defined(PIC)
- uint64_t ebxsave;
-#endif
-#if ARCH_X86_64
- uint64_t retsave;
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"REG_b", %5 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %6 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %5 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
-#if ARCH_X86_64
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
- "add %%"REG_S", %%"REG_c" \n\t"\
- "add %%"REG_a", %%"REG_D" \n\t"\
- "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#else
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
- "add %%"REG_a", %%"REG_D" \n\t"\
- "xor %%"REG_a", %%"REG_a" \n\t"\
-
-#endif /* ARCH_X86_64 */
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %5, %%"REG_b" \n\t"
-#if ARCH_X86_64
- "mov %6, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %5, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
- ,"%"REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
- dst[i] = src[srcW-1]*128;
-}
-
-static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
- int dstWidth, const uint8_t *src1,
- const uint8_t *src2, int srcW, int xInc)
-{
- int32_t *filterPos = c->hChrFilterPos;
- int16_t *filter = c->hChrFilter;
- void *mmxextFilterCode = c->chrMmxextFilterCode;
- int i;
-#if defined(PIC)
- DECLARE_ALIGNED(8, uint64_t, ebxsave);
-#endif
-#if ARCH_X86_64
- DECLARE_ALIGNED(8, uint64_t, retsave);
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"REG_b", %7 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %8 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"REG_a" \n\t"
- "mov %%"REG_a", %7 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- "mov %5, %%"REG_c" \n\t" // src
- "mov %6, %%"REG_D" \n\t" // buf2
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %7, %%"REG_b" \n\t"
-#if ARCH_X86_64
- "mov %8, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %7, %%"REG_a" \n\t"
- "mov %%"REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
-#if !defined(PIC)
- ,"%"REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
- dst1[i] = src1[srcW-1]*128;
- dst2[i] = src2[srcW-1]*128;
- }
-}
-#endif /* COMPILE_TEMPLATE_MMXEXT */
-
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
enum AVPixelFormat dstFormat = c->dstFormat;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
- if (!(c->flags & SWS_BITEXACT)) {
+ c->use_mmx_vfilter= 0;
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+ && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
+#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
+#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
@@ -1579,10 +1515,15 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
} else {
+ c->use_mmx_vfilter= 1;
+ c->yuv2planeX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
+ case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
+#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
+#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
@@ -1590,7 +1531,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
}
- }
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
@@ -1619,12 +1559,12 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
- if (c->srcBpc == 8 && c->dstBpc <= 10) {
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
- c->hyscale_fast = RENAME(hyscale_fast);
- c->hcscale_fast = RENAME(hcscale_fast);
+ c->hyscale_fast = ff_hyscale_fast_mmxext;
+ c->hcscale_fast = ff_hcscale_fast_mmxext;
} else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
c->hyscale_fast = NULL;
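
The WRITE* macro changes above (taking dstw as a string instead of stringizing the token) exist because `#dstw` never macro-expands its argument, which is why the code previously hard-wired the literal 8280 with a note that the preprocessor could not handle DSTW_OFFSET there. A reduced illustration of the difference, using hypothetical macro names rather than the real ones:

#include <stdio.h>

#define DSTW_OFF "8280"                                /* assumed string-valued offset macro */
#define CMP_OLD(dstw) "cmp " #dstw ", %%eax\n\t"       /* stringizes the token, no expansion */
#define CMP_NEW(dstw) "cmp "  dstw ", %%eax\n\t"       /* concatenates an already-string arg */

int main(void)
{
    puts(CMP_OLD(8280(%5)));        /* cmp 8280(%5), %%eax      -- literal works            */
    puts(CMP_OLD(DSTW_OFF(%5)));    /* cmp DSTW_OFF(%5), %%eax  -- macro name leaks through */
    puts(CMP_NEW(DSTW_OFF"(%5)"));  /* cmp 8280(%5), %%eax      -- expands as intended      */
    return 0;
}
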
diff --git a/libswscale/x86/w64xmmtest.c b/libswscale/x86/w64xmmtest.c
index dd9a2a4378..88143d9687 100644
--- a/libswscale/x86/w64xmmtest.c
+++ b/libswscale/x86/w64xmmtest.c
@@ -2,20 +2,20 @@
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index 68d3bc2fe7..5e2f77c20f 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -7,20 +7,20 @@
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -50,34 +50,30 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
//MMX versions
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "yuv2rgb_template.c"
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
// MMXEXT versions
-#if HAVE_MMXEXT_INLINE
+#if HAVE_MMXEXT_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
-#endif /* HAVE_MMXEXT_INLINE */
+#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
#endif /* HAVE_INLINE_ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_INLINE && HAVE_6REGS
int cpu_flags = av_get_cpu_flags();
- if (c->srcFormat != AV_PIX_FMT_YUV420P &&
- c->srcFormat != AV_PIX_FMT_YUVA420P)
- return NULL;
-
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
@@ -117,7 +113,7 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
return yuv420_rgb15_mmx;
}
}
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
return NULL;
}
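
The removal of the YUV420P/YUVA420P-only check above is paired with the vshift change in the yuv2rgb_template.c diff that follows: 4:2:0 sources store chroma at half vertical resolution, so output row y reads chroma row y >> 1, while 4:2:2 chroma is full height and needs no shift, replacing the old trick of doubling the chroma strides up front. A small sketch of that row addressing (assumed helper, not part of the patch):

#include <stdint.h>

/* Pick the chroma row for output row y; half_height is 1 for 4:2:0, 0 for 4:2:2. */
static const uint8_t *chroma_row(const uint8_t *plane, int stride,
                                 int y, int half_height)
{
    return plane + (y >> half_height) * stride;
}
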
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index 0b9751623e..acb78f520e 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -48,17 +48,14 @@
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
- if (c->srcFormat == AV_PIX_FMT_YUV422P) { \
- srcStride[1] *= 2; \
- srcStride[2] *= 2; \
- } \
+ vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
- const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
- const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
+ const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
+ const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
@@ -142,10 +139,21 @@
"add $4, %0\n\t" \
"js 1b\n\t" \
+#if COMPILE_TEMPLATE_MMXEXT
+#undef RGB_PACK24_B_OPERANDS
+#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
+#else
+#undef RGB_PACK24_B_OPERANDS
+#define RGB_PACK24_B_OPERANDS
+#endif
+
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
+ NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \
+ RGB_PACK24_B_OPERANDS \
+ : "memory" \
); \
} \
@@ -153,6 +161,8 @@
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
+ NAMED_CONSTRAINTS_ADD(mmx_00ffw) \
+ : "memory" \
); \
} \
@@ -193,7 +203,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -221,7 +231,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(2)
@@ -311,7 +321,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -329,7 +339,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(3)
@@ -373,7 +383,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -394,7 +404,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -416,7 +426,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)
@@ -437,7 +447,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
- int y, h_size;
+ int y, h_size, vshift;
YUV2RGB_LOOP(4)