summary | refs | log | tree | commit | diff
path: root/libswscale/x86/swscale_template.c
diff options
context:
space:
mode:
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r-- libswscale/x86/swscale_template.c 540
1 files changed, 240 insertions, 300 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 3fb8bc8882..1a102271c1 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,20 +1,20 @@
/*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,21 +25,101 @@
#undef REAL_MOVNTQ
#undef MOVNTQ
+#undef MOVNTQ2
#undef PREFETCH
-#if COMPILE_TEMPLATE_MMXEXT
-#define PREFETCH "prefetchnta"
-#else
-#define PREFETCH " # nop"
-#endif
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
+#if !COMPILE_TEMPLATE_MMXEXT /* helper is only emitted when the template is compiled with COMPILE_TEMPLATE_MMXEXT == 0 */
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot) /* loads the 8 bytes at srcDither and leaves them widened to 16-bit words in mm3 (low 4 bytes) and mm4 (high 4 bytes); rot != 0 rotates the 8 bytes first */
+{
+ if (rot) { /* rotated variant */
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t" /* mm0 = 0, supplies the zero high byte of each unpacked word */
+ "movq (%0), %%mm3\n\t" /* mm3 = the 8 dither bytes */
+ "movq %%mm3, %%mm4\n\t" /* mm4 = copy used for the wrap-around part of the rotate */
+ "psrlq $24, %%mm3\n\t" /* logical right shift by 24 bits ... */
+ "psllq $40, %%mm4\n\t" /* ... plus left shift by 40 bits (24 + 40 = 64) ... */
+ "por %%mm4, %%mm3\n\t" /* ... OR-ed together: 64-bit rotate right by 24 bits (3 bytes) */
+ "movq %%mm3, %%mm4\n\t" /* duplicate the rotated value before unpacking */
+ "punpcklbw %%mm0, %%mm3\n\t" /* mm3 = low 4 bytes zero-extended to words */
+ "punpckhbw %%mm0, %%mm4\n\t" /* mm4 = high 4 bytes zero-extended to words */
+ :: "r"(srcDither) /* input only: no outputs or clobbers are declared, the result lives in mm3/mm4 */
+ );
+ } else { /* unrotated variant: same load and unpack without the shift/or sequence */
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t" /* mm0 = 0 */
+ "movq (%0), %%mm3\n\t" /* mm3 = the 8 dither bytes */
+ "movq %%mm3, %%mm4\n\t" /* mm4 = copy for the high half */
+ "punpcklbw %%mm0, %%mm3\n\t" /* mm3 = low 4 bytes zero-extended to words */
+ "punpckhbw %%mm0, %%mm4\n\t" /* mm4 = high 4 bytes zero-extended to words */
+ :: "r"(srcDither) /* input only: no outputs or clobbers are declared, the result lives in mm3/mm4 */
+ );
+ }
+}
+#endif /* !COMPILE_TEMPLATE_MMXEXT */
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset) /* filters 16-bit source rows into 8-bit dest, 8 pixels per outer iteration; 'filter' is walked as 16-byte entries (pointer at +0, coefficient qword at +8); 'src' is not referenced in this body */
+{
+ dither_8to16(dither, offset); /* leaves the widened dither words in mm3/mm4; the asm statements below read those registers directly */
+ filterSize--; /* the decremented value is what gets broadcast below */
+ __asm__ volatile(
+ "movd %0, %%mm1\n\t" /* mm1 low dword = filterSize (already decremented) */
+ "punpcklwd %%mm1, %%mm1\n\t" /* duplicate the low word ... */
+ "punpckldq %%mm1, %%mm1\n\t" /* ... so all 4 words of mm1 hold it */
+ "psllw $3, %%mm1\n\t" /* each word *= 8 */
+ "paddw %%mm1, %%mm3\n\t" /* add to the dither words from dither_8to16 */
+ "paddw %%mm1, %%mm4\n\t"
+ "psraw $4, %%mm3\n\t" /* arithmetic >> 4 on each word */
+ "psraw $4, %%mm4\n\t"
+ ::"m"(filterSize) /* input only; MMX registers are not declared as outputs or clobbers */
+ );
+
+ __asm__ volatile(\
+ "movq %%mm3, %%mm6\n\t" /* mm6/mm7 keep the initial accumulator values ... */
+ "movq %%mm4, %%mm7\n\t" /* ... so they can be reloaded for every 8-pixel group */
+ "movl %3, %%ecx\n\t" /* pixel index starts at 'offset' */
+ "mov %0, %%"FF_REG_d" \n\t"\
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ ".p2align 4 \n\t" /* FIXME Unroll? */\
+ "1: \n\t"\
+ "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
+ "movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
+ "movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
+ "add $16, %%"FF_REG_d" \n\t"\
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ "test %%"FF_REG_S", %%"FF_REG_S" \n\t" /* zero next-row pointer ends the tap loop; flags are consumed by the jnz below */\
+ "pmulhw %%mm0, %%mm2 \n\t"\
+ "pmulhw %%mm0, %%mm5 \n\t"\
+ "paddw %%mm2, %%mm3 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ " jnz 1b \n\t"\
+ "psraw $3, %%mm3 \n\t"\
+ "psraw $3, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm3 \n\t" /* saturate the 8 accumulated words to 8 unsigned bytes */
+ MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t" /* store 8 bytes; MOVNTQ2 is "movntq " or "movq " depending on COMPILE_TEMPLATE_MMXEXT */
+ "add $8, %%"FF_REG_c" \n\t"\
+ "cmp %2, %%"FF_REG_c" \n\t"\
+ "movq %%mm6, %%mm3\n\t" /* reload the saved initial accumulators for the next group */
+ "movq %%mm7, %%mm4\n\t"
+ "mov %0, %%"FF_REG_d" \n\t"\
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ "jb 1b \n\t"\
+ :: "g" (filter),
+ "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) /* %1 is biased by -offset because the index starts at offset; %2 = dstW+offset is the loop bound */
+ : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c /* only general-purpose registers are listed as clobbered */
+ );
+}
+
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
@@ -53,11 +133,11 @@
".p2align 4 \n\t"\
"2: \n\t"\
"movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
- "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
- "add %6, %%"FF_REG_S" \n\t" \
- "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
- "add $16, %%"FF_REG_d" \n\t"\
- "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
+ "add %6, %%"FF_REG_S" \n\t" \
+ "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
+ "add $16, %%"FF_REG_d" \n\t"\
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
@@ -92,7 +172,8 @@
:: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\
"r" (dest), "m" (dstW_reg), "m"(uv_off) \
- : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
+ NAMED_CONSTRAINTS_ADD(bF8,bFC) \
+ : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
);
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
@@ -110,7 +191,7 @@
".p2align 4 \n\t"\
"2: \n\t"\
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
- "add %6, %%"FF_REG_S" \n\t" \
+ "add %6, %%"FF_REG_S" \n\t" \
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
"mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
@@ -122,7 +203,7 @@
"pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
- "add %6, %%"FF_REG_S" \n\t"\
+ "add %6, %%"FF_REG_S" \n\t" \
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
"mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
@@ -148,18 +229,18 @@
"movq %%mm6, "V_TEMP"(%0) \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
- "lea "offset"(%0), %%"FF_REG_d" \n\t"\
- "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ "lea "offset"(%0), %%"FF_REG_d" \n\t"\
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
"pxor %%mm1, %%mm1 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
".p2align 4 \n\t"\
"2: \n\t"\
- "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
- "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
- "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
- "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
+ "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
+ "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
+ "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
+ "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm4, %%mm0 \n\t"\
"punpckhwd %%mm4, %%mm3 \n\t"\
@@ -252,7 +333,7 @@
MOVNTQ( q3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
@@ -265,9 +346,9 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"movq %%mm2, "U_TEMP"(%0) \n\t"
@@ -278,13 +359,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
@@ -298,22 +379,51 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
+
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ } else {
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "pcmpeqd %%mm7, %%mm7 \n\t"
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ YSCALEYUV2PACKEDX_END
+ }
+}
+
+static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ int chrFilterSize, const int16_t **alpSrc,
+ uint8_t *dest, int dstW, int dstY) /* added by this patch: same macro sequence as yuv2rgb32_X, but WRITEBGR32 receives %%mm5 and %%mm2 in swapped b/r positions */
+{
+ x86_reg dummy=0;
+ x86_reg dstW_reg = dstW;
+ x86_reg uv_off = c->uv_offx2;
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { /* alpha branch: taken only when CONFIG_SWSCALE_ALPHA is set and c->needAlpha is non-zero */
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, %5, %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) /* b=%%mm5, r=%%mm2 (yuv2rgb32_X passes b=%%mm2, r=%%mm5); a=%%mm1 is the packed alpha from above */
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%4, %5, %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) /* b=%%mm5, r=%%mm2; a=%%mm7 was set to all ones by the pcmpeqd above */
YSCALEYUV2PACKEDX_END
}
}
@@ -342,7 +452,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
@@ -355,7 +465,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -366,7 +476,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
- WRITERGB16(%4, %5, %%FF_REGa)
+ WRITERGB16(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -379,7 +489,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -390,7 +500,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
- WRITERGB16(%4, %5, %%FF_REGa)
+ WRITERGB16(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -419,7 +529,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
@@ -432,7 +542,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
@@ -443,7 +553,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
- WRITERGB15(%4, %5, %%FF_REGa)
+ WRITERGB15(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -456,7 +566,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
@@ -467,7 +577,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
- WRITERGB15(%4, %5, %%FF_REGa)
+ WRITERGB15(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -521,7 +631,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR24MMXEXT(dst, dstw, index) \
@@ -569,7 +679,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#if COMPILE_TEMPLATE_MMXEXT
@@ -580,6 +690,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
+#if HAVE_6REGS
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
@@ -589,17 +700,18 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" // FIXME optimize
- "add %4, %%"FF_REG_c" \n\t"
- WRITEBGR24(%%FF_REGc, %5, %%FF_REGa)
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
+ "add %4, %%"FF_REG_c" \n\t"
+ WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
);
}
@@ -613,20 +725,22 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" // FIXME optimize
- "add %4, %%"FF_REG_c" \n\t"
- WRITEBGR24(%%FF_REGc, %5, %%FF_REGa)
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
+ "add %4, %%"FF_REG_c" \n\t"
+ WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
);
}
+#endif /* HAVE_6REGS */
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
@@ -641,7 +755,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
- "cmp "#dstw", "#index" \n\t"\
+ "cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
@@ -654,7 +768,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -662,7 +776,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%FF_REGa)
+ WRITEYUY2(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -675,7 +789,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
- x86_reg uv_off = c->uv_off_byte;
+ x86_reg uv_off = c->uv_offx2;
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -683,7 +797,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
- WRITEYUY2(%4, %5, %%FF_REGa)
+ WRITEYUY2(%4, "%5", %%FF_REGa)
YSCALEYUV2PACKEDX_END
}
@@ -775,7 +889,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
__asm__ volatile(
@@ -784,18 +898,18 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t"
- WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
"a" (&c->redDither),
"r" (abuf0), "r" (abuf1)
: "%r8"
);
#else
- *(const uint16_t **)(&c->u_temp)=abuf0;
- *(const uint16_t **)(&c->v_temp)=abuf1;
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"FF_REG_b" \n\t"
+ "mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB(%%FF_REGBP, %5)
"push %0 \n\t"
@@ -808,7 +922,7 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
"packuswb %%mm7, %%mm1 \n\t"
"pop %1 \n\t"
"pop %0 \n\t"
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -818,11 +932,11 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
} else {
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"FF_REG_b" \n\t"
+ "mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB(%%FF_REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -839,18 +953,18 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB(%%FF_REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
@@ -862,24 +976,24 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"FF_REG_b" \n\t"
+ "mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB(%%FF_REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
}
@@ -891,7 +1005,6 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"FF_REG_b" \n\t"
@@ -900,15 +1013,16 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
@@ -960,15 +1074,14 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
- //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
- "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"FF_REG_b" \n\t"
- "push %%"FF_REG_BP" \n\t"
+ "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"FF_REG_b" \n\t"
+ "push %%"FF_REG_BP" \n\t"
YSCALEYUV2PACKED(%%FF_REGBP, %5)
- WRITEYUY2(%%FF_REGb, 8280(%5), %%FF_REGBP)
- "pop %%"FF_REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
+ WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
+ "pop %%"FF_REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
@@ -1102,14 +1215,14 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1(%%FF_REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1122,7 +1235,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1(%%FF_REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1131,14 +1244,14 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
}
} else {
const int16_t *ubuf1 = ubuf[1];
- if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+ if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
__asm__ volatile(
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1b(%%FF_REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1151,7 +1264,7 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1b(%%FF_REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
- WRITEBGR32(%%FF_REGb, 8280(%5), %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+ WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1177,11 +1290,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1(%%FF_REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1191,11 +1305,12 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2RGB1b(%%FF_REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
- WRITEBGR24(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
}
@@ -1218,15 +1333,16 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1238,15 +1354,16 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB15(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8)
);
}
}
@@ -1269,15 +1386,16 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
} else {
const int16_t *ubuf1 = ubuf[1];
@@ -1289,15 +1407,16 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
+ "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
- "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
+ "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
- WRITERGB16(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
+ NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
}
@@ -1354,7 +1473,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
"mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2PACKED1(%%FF_REGBP, %5)
- WRITEYUY2(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1367,7 +1486,7 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
"mov %4, %%"FF_REG_b" \n\t"
"push %%"FF_REG_BP" \n\t"
YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
- WRITEYUY2(%%FF_REGb, 8280(%5), %%FF_REGBP)
+ WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
"pop %%"FF_REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
@@ -1375,203 +1494,20 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
);
}
}
-
-#if COMPILE_TEMPLATE_MMXEXT
-static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
- int dstWidth, const uint8_t *src,
- int srcW, int xInc)
-{
- int32_t *filterPos = c->hLumFilterPos;
- int16_t *filter = c->hLumFilter;
- void *mmxextFilterCode = c->lumMmxextFilterCode;
- int i;
-#if defined(PIC)
- uint64_t ebxsave;
-#endif
-#if ARCH_X86_64
- uint64_t retsave;
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"FF_REG_b", %5 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", %6 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", %5 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"FF_REG_c" \n\t"
- "mov %1, %%"FF_REG_D" \n\t"
- "mov %2, %%"FF_REG_d" \n\t"
- "mov %3, %%"FF_REG_b" \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
- PREFETCH" (%%"FF_REG_c") \n\t"
- PREFETCH" 32(%%"FF_REG_c") \n\t"
- PREFETCH" 64(%%"FF_REG_c") \n\t"
-
-#if ARCH_X86_64
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"FF_REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
- "add %%"FF_REG_S", %%"FF_REG_c" \n\t"\
- "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
-
-#else
-#define CALL_MMXEXT_FILTER_CODE \
- "movl (%%"FF_REG_b"), %%esi \n\t"\
- "call *%4 \n\t"\
- "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
- "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
-
-#endif /* ARCH_X86_64 */
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %5, %%"FF_REG_b" \n\t"
-#if ARCH_X86_64
- "mov %6, %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %5, %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
-#if !defined(PIC)
- ,"%"FF_REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
- dst[i] = src[srcW-1]*128;
-}
-
-static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
- int dstWidth, const uint8_t *src1,
- const uint8_t *src2, int srcW, int xInc)
-{
- int32_t *filterPos = c->hChrFilterPos;
- int16_t *filter = c->hChrFilter;
- void *mmxextFilterCode = c->chrMmxextFilterCode;
- int i;
-#if defined(PIC)
- DECLARE_ALIGNED(8, uint64_t, ebxsave);
-#endif
-#if ARCH_X86_64
- DECLARE_ALIGNED(8, uint64_t, retsave);
-#endif
-
- __asm__ volatile(
-#if defined(PIC)
- "mov %%"FF_REG_b", %7 \n\t"
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", %8 \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov -8(%%rsp), %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", %7 \n\t"
-#endif
-#endif
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"FF_REG_c" \n\t"
- "mov %1, %%"FF_REG_D" \n\t"
- "mov %2, %%"FF_REG_d" \n\t"
- "mov %3, %%"FF_REG_b" \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
- PREFETCH" (%%"FF_REG_c") \n\t"
- PREFETCH" 32(%%"FF_REG_c") \n\t"
- PREFETCH" 64(%%"FF_REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
- "mov %5, %%"FF_REG_c" \n\t" // src
- "mov %6, %%"FF_REG_D" \n\t" // buf2
- PREFETCH" (%%"FF_REG_c") \n\t"
- PREFETCH" 32(%%"FF_REG_c") \n\t"
- PREFETCH" 64(%%"FF_REG_c") \n\t"
-
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
- CALL_MMXEXT_FILTER_CODE
-
-#if defined(PIC)
- "mov %7, %%"FF_REG_b" \n\t"
-#if ARCH_X86_64
- "mov %8, %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", -8(%%rsp) \n\t"
-#endif
-#else
-#if ARCH_X86_64
- "mov %7, %%"FF_REG_a" \n\t"
- "mov %%"FF_REG_a", -8(%%rsp) \n\t"
-#endif
-#endif
- :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
- "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
-#if defined(PIC)
- ,"m" (ebxsave)
-#endif
-#if ARCH_X86_64
- ,"m"(retsave)
-#endif
- : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
-#if !defined(PIC)
- ,"%"FF_REG_b
-#endif
- );
-
- for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
- dst1[i] = src1[srcW-1]*128;
- dst2[i] = src2[srcW-1]*128;
- }
-}
-#endif /* COMPILE_TEMPLATE_MMXEXT */
-
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
enum AVPixelFormat dstFormat = c->dstFormat;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
- if (!(c->flags & SWS_BITEXACT)) {
+ c->use_mmx_vfilter= 0;
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+ && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
+#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
+#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
@@ -1579,10 +1515,15 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
} else {
+ c->use_mmx_vfilter= 1;
+ c->yuv2planeX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
+ case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
+#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
+#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
@@ -1590,7 +1531,6 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
}
- }
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
@@ -1619,12 +1559,12 @@ static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
}
}
- if (c->srcBpc == 8 && c->dstBpc <= 10) {
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
- c->hyscale_fast = RENAME(hyscale_fast);
- c->hcscale_fast = RENAME(hcscale_fast);
+ c->hyscale_fast = ff_hyscale_fast_mmxext;
+ c->hcscale_fast = ff_hcscale_fast_mmxext;
} else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
c->hyscale_fast = NULL;