summaryrefslogtreecommitdiff
path: root/libswscale/x86/swscale_template.c
diff options
context:
space:
mode:
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r--libswscale/x86/swscale_template.c356
1 files changed, 291 insertions, 65 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index f58ac520e1..ae0d394078 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,8 +37,8 @@
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
__asm__ volatile(\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
+ "movq "DITHER16"+0(%0), %%mm3 \n\t"\
+ "movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
@@ -60,8 +60,8 @@
MOVNTQ(%%mm3, (%1, %3))\
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
+ "movq "DITHER16"+0(%0), %%mm3 \n\t"\
+ "movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
@@ -75,16 +75,21 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest[4], int dstW, int chrDstW)
+ uint8_t *dest[4], int dstW, int chrDstW,
+ const uint8_t *lumDither, const uint8_t *chrDither)
{
+ int i;
uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
*aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
if (uDest) {
x86_reg uv_off = c->uv_off;
+ for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4;
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+ for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4;
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
}
+ for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4;
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
}
@@ -95,6 +100,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
__asm__ volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\
+ "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+ "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+ "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+ "movq "DITHER32"+24(%0), %%mm7 \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
@@ -126,26 +135,21 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"paddd %%mm2, %%mm6 \n\t"\
"paddd %%mm0, %%mm7 \n\t"\
" jnz 1b \n\t"\
- "psrad $16, %%mm4 \n\t"\
- "psrad $16, %%mm5 \n\t"\
- "psrad $16, %%mm6 \n\t"\
- "psrad $16, %%mm7 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
+ "psrad $19, %%mm4 \n\t"\
+ "psrad $19, %%mm5 \n\t"\
+ "psrad $19, %%mm6 \n\t"\
+ "psrad $19, %%mm7 \n\t"\
"packssdw %%mm5, %%mm4 \n\t"\
"packssdw %%mm7, %%mm6 \n\t"\
- "paddw %%mm0, %%mm4 \n\t"\
- "paddw %%mm0, %%mm6 \n\t"\
- "psraw $3, %%mm4 \n\t"\
- "psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm4 \n\t"\
MOVNTQ(%%mm4, (%1, %3))\
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
+ "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+ "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+ "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+ "movq "DITHER32"+24(%0), %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
@@ -158,16 +162,21 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
- uint8_t *dest[4], int dstW, int chrDstW)
+ uint8_t *dest[4], int dstW, int chrDstW,
+ const uint8_t *lumDither, const uint8_t *chrDither)
{
+ int i;
uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
*aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
if (uDest) {
x86_reg uv_off = c->uv_off;
+ for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12;
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+ for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12;
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
}
+ for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12;
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
}
@@ -178,7 +187,8 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
const int16_t *chrUSrc, const int16_t *chrVSrc,
const int16_t *alpSrc,
- uint8_t *dst[4], int dstW, int chrDstW)
+ uint8_t *dst[4], int dstW, int chrDstW,
+ const uint8_t *lumDither, const uint8_t *chrDither)
{
int p= 4;
const int16_t *src[4]= {
@@ -212,7 +222,8 @@ static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
const int16_t *chrUSrc, const int16_t *chrVSrc,
const int16_t *alpSrc,
- uint8_t *dst[4], int dstW, int chrDstW)
+ uint8_t *dst[4], int dstW, int chrDstW,
+ const uint8_t *lumDither, const uint8_t *chrDither)
{
int p= 4;
const int16_t *src[4]= {
@@ -223,16 +234,17 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
while (p--) {
if (dst[p]) {
+ int i;
+ for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i];
__asm__ volatile(
"mov %2, %%"REG_a" \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "psllw $6, %%mm7 \n\t"
+ "movq 0(%3), %%mm6 \n\t"
+ "movq 8(%3), %%mm7 \n\t"
".p2align 4 \n\t" /* FIXME Unroll? */
"1: \n\t"
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
- "paddsw %%mm7, %%mm0 \n\t"
+ "paddsw %%mm6, %%mm0 \n\t"
"paddsw %%mm7, %%mm1 \n\t"
"psraw $7, %%mm0 \n\t"
"psraw $7, %%mm1 \n\t"
@@ -241,7 +253,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
"add $8, %%"REG_a" \n\t"
"jnc 1b \n\t"
:: "r" (src[p]), "r" (dst[p] + counter[p]),
- "g" (-counter[p])
+ "g" (-counter[p]), "r"(c->dither16)
: "%"REG_a
);
}
@@ -999,8 +1011,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
: "%r8"
);
#else
- *(const uint16_t **)(&c->u_temp)=abuf0;
- *(const uint16_t **)(&c->v_temp)=abuf1;
+ c->u_temp=(intptr_t)abuf0;
+ c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
@@ -1625,6 +1637,32 @@ static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
assert(src1 == src2);
}
+static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ __asm__ volatile(
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",2), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
+ "movq (%2, %%"REG_a",2), %%mm2 \n\t"
+ "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "psrlw $8, %%mm2 \n\t"
+ "psrlw $8, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%3, %%"REG_a") \n\t"
+ "movq %%mm2, (%4, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
+ : "%"REG_a
+ );
+}
+
/* This is almost identical to the previous, end exists only because
* yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
@@ -1674,6 +1712,33 @@ static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
assert(src1 == src2);
}
+static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ __asm__ volatile(
+ "movq "MANGLE(bm01010101)", %%mm4 \n\t"
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",2), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
+ "movq (%2, %%"REG_a",2), %%mm2 \n\t"
+ "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
+ "pand %%mm4, %%mm0 \n\t"
+ "pand %%mm4, %%mm1 \n\t"
+ "pand %%mm4, %%mm2 \n\t"
+ "pand %%mm4, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%3, %%"REG_a") \n\t"
+ "movq %%mm2, (%4, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
+ : "%"REG_a
+ );
+}
+
static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
const uint8_t *src, int width)
{
@@ -1715,7 +1780,7 @@ static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
-static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
+static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
int width, enum PixelFormat srcFormat)
{
@@ -1756,32 +1821,31 @@ static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *s
"paddd %%mm3, %%mm2 \n\t"
"paddd %%mm4, %%mm0 \n\t"
"paddd %%mm4, %%mm2 \n\t"
- "psrad $15, %%mm0 \n\t"
- "psrad $15, %%mm2 \n\t"
+ "psrad $9, %%mm0 \n\t"
+ "psrad $9, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%1, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
+ "movq %%mm0, (%1, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
" js 1b \n\t"
: "+r" (src)
- : "r" (dst+width), "g" ((x86_reg)-width)
+ : "r" (dst+width), "g" ((x86_reg)-2*width)
: "%"REG_a
);
}
-static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
+static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src,
int width, uint32_t *unused)
{
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
-static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
+static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src,
int width, uint32_t *unused)
{
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
-static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
+static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
const uint8_t *src, int width,
enum PixelFormat srcFormat)
{
@@ -1823,25 +1887,23 @@ static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
"paddd %%mm3, %%mm2 \n\t"
"paddd %%mm3, %%mm1 \n\t"
"paddd %%mm3, %%mm4 \n\t"
- "psrad $15, %%mm0 \n\t"
- "psrad $15, %%mm2 \n\t"
- "psrad $15, %%mm1 \n\t"
- "psrad $15, %%mm4 \n\t"
+ "psrad $9, %%mm0 \n\t"
+ "psrad $9, %%mm2 \n\t"
+ "psrad $9, %%mm1 \n\t"
+ "psrad $9, %%mm4 \n\t"
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm4, %%mm2 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm2, %%mm2 \n\t"
- "movd %%mm0, (%1, %%"REG_a") \n\t"
- "movd %%mm2, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
+ "movq %%mm0, (%1, %%"REG_a") \n\t"
+ "movq %%mm2, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
" js 1b \n\t"
: "+r" (src)
- : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
+ : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
: "%"REG_a
);
}
-static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
+static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *unused)
{
@@ -1849,7 +1911,7 @@ static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
assert(src1 == src2);
}
-static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
+static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *unused)
{
@@ -1859,7 +1921,7 @@ static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
#if !COMPILE_TEMPLATE_MMX2
// bilinear / bicubic scaling
-static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
+static void RENAME(hScale)(int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int16_t *filterPos, int filterSize)
{
@@ -2015,6 +2077,163 @@ static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
+static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
+ const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
+{
+ int i, j;
+
+ assert(filterSize % 4 == 0 && filterSize>0);
+ if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
+ x86_reg counter= -2*dstW;
+ filter-= counter*2;
+ filterPos-= counter/2;
+ dst-= counter/2;
+ __asm__ volatile(
+ "movd %5, %%mm7 \n\t"
+#if defined(PIC)
+ "push %%"REG_b" \n\t"
+#endif
+ "push %%"REG_BP" \n\t" // we use 7 regs here ...
+ "mov %%"REG_a", %%"REG_BP" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movzwl (%2, %%"REG_BP"), %%eax \n\t"
+ "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
+ "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
+ "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
+ "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
+ "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "movq %%mm0, %%mm4 \n\t"
+ "punpckldq %%mm3, %%mm0 \n\t"
+ "punpckhdq %%mm3, %%mm4 \n\t"
+ "paddd %%mm4, %%mm0 \n\t"
+ "psrad %%mm7, %%mm0 \n\t"
+ "packssdw %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%4, %%"REG_BP") \n\t"
+ "add $4, %%"REG_BP" \n\t"
+ " jnc 1b \n\t"
+
+ "pop %%"REG_BP" \n\t"
+#if defined(PIC)
+ "pop %%"REG_b" \n\t"
+#endif
+ : "+a" (counter)
+ : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
+#if !defined(PIC)
+ : "%"REG_b
+#endif
+ );
+ } else if (filterSize==8 && shift<15) {
+ x86_reg counter= -2*dstW;
+ filter-= counter*4;
+ filterPos-= counter/2;
+ dst-= counter/2;
+ __asm__ volatile(
+ "movd %5, %%mm7 \n\t"
+#if defined(PIC)
+ "push %%"REG_b" \n\t"
+#endif
+ "push %%"REG_BP" \n\t" // we use 7 regs here ...
+ "mov %%"REG_a", %%"REG_BP" \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movzwl (%2, %%"REG_BP"), %%eax \n\t"
+ "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
+ "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
+ "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
+ "movq (%3, %%"REG_a", 2), %%mm0 \n\t"
+ "movq (%3, %%"REG_b", 2), %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+
+ "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
+ "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
+ "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t"
+ "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "paddd %%mm4, %%mm0 \n\t"
+ "paddd %%mm5, %%mm3 \n\t"
+ "movq %%mm0, %%mm4 \n\t"
+ "punpckldq %%mm3, %%mm0 \n\t"
+ "punpckhdq %%mm3, %%mm4 \n\t"
+ "paddd %%mm4, %%mm0 \n\t"
+ "psrad %%mm7, %%mm0 \n\t"
+ "packssdw %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%4, %%"REG_BP") \n\t"
+ "add $4, %%"REG_BP" \n\t"
+ " jnc 1b \n\t"
+
+ "pop %%"REG_BP" \n\t"
+#if defined(PIC)
+ "pop %%"REG_b" \n\t"
+#endif
+ : "+a" (counter)
+ : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
+#if !defined(PIC)
+ : "%"REG_b
+#endif
+ );
+ } else if (shift<15){
+ const uint16_t *offset = src+filterSize;
+ x86_reg counter= -2*dstW;
+ //filter-= counter*filterSize/2;
+ filterPos-= counter/2;
+ dst-= counter/2;
+ __asm__ volatile(
+ "movd %7, %%mm7 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "mov %2, %%"REG_c" \n\t"
+ "movzwl (%%"REG_c", %0), %%eax \n\t"
+ "movzwl 2(%%"REG_c", %0), %%edx \n\t"
+ "mov %5, %%"REG_c" \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t"
+ "2: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1, %6), %%mm3 \n\t"
+ "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t"
+ "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t"
+ "paddd %%mm0, %%mm4 \n\t"
+ "add $8, %1 \n\t"
+ "add $8, %%"REG_c" \n\t"
+ "cmp %4, %%"REG_c" \n\t"
+ " jb 2b \n\t"
+ "add %6, %1 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "punpckldq %%mm5, %%mm4 \n\t"
+ "punpckhdq %%mm5, %%mm0 \n\t"
+ "paddd %%mm0, %%mm4 \n\t"
+ "psrad %%mm7, %%mm4 \n\t"
+ "packssdw %%mm4, %%mm4 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ "movd %%mm4, (%%"REG_a", %0) \n\t"
+ "add $4, %0 \n\t"
+ " jnc 1b \n\t"
+
+ : "+r" (counter), "+r" (filter)
+ : "m" (filterPos), "m" (dst), "m"(offset),
+ "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
+ : "%"REG_a, "%"REG_c, "%"REG_d
+ );
+ } else
+ for (i=0; i<dstW; i++) {
+ int srcPos= filterPos[i];
+ int val=0;
+ for (j=0; j<filterSize; j++) {
+ val += ((int)src[srcPos + j])*filter[filterSize*i + j];
+ }
+ dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
+ }
+}
+
+
#if COMPILE_TEMPLATE_MMX2
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
@@ -2156,9 +2375,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
enum PixelFormat srcFormat = c->srcFormat,
dstFormat = c->dstFormat;
- if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
- dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
- if (!(c->flags & SWS_BITEXACT)) {
+ if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
+ && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
c->yuv2yuvX = RENAME(yuv2yuvX_ar );
@@ -2173,7 +2391,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
} else {
- c->yuv2yuv1 = RENAME(yuv2yuv1 );
+ int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
+ c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 );
c->yuv2yuvX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
@@ -2186,7 +2405,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
}
- }
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case PIX_FMT_RGB32:
@@ -2215,7 +2433,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
- if (c->scalingBpp == 8) {
#if !COMPILE_TEMPLATE_MMX2
c->hScale = RENAME(hScale );
#endif /* !COMPILE_TEMPLATE_MMX2 */
@@ -2233,7 +2450,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
#if COMPILE_TEMPLATE_MMX2
}
#endif /* COMPILE_TEMPLATE_MMX2 */
- }
#if !COMPILE_TEMPLATE_MMX2
switch(srcFormat) {
@@ -2241,7 +2457,13 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
- default: break;
+ case PIX_FMT_GRAY16LE :
+ case PIX_FMT_YUV420P9LE:
+ case PIX_FMT_YUV422P10LE:
+ case PIX_FMT_YUV420P10LE:
+ case PIX_FMT_YUV420P16LE:
+ case PIX_FMT_YUV422P16LE:
+ case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
if (!c->chrSrcHSubSample) {
@@ -2255,8 +2477,10 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
switch (srcFormat) {
#if !COMPILE_TEMPLATE_MMX2
case PIX_FMT_YUYV422 :
- case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
- case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
+ case PIX_FMT_Y400A :
+ c->lumToYV12 = RENAME(yuy2ToY); break;
+ case PIX_FMT_UYVY422 :
+ c->lumToYV12 = RENAME(uyvyToY); break;
#endif /* !COMPILE_TEMPLATE_MMX2 */
case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
@@ -2270,4 +2494,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
#endif /* !COMPILE_TEMPLATE_MMX2 */
+ if(isAnyRGB(c->srcFormat))
+ c->hScale16= RENAME(hScale16);
}