diff options
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r-- | libswscale/x86/swscale_template.c | 356 |
1 files changed, 291 insertions, 65 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c index f58ac520e1..ae0d394078 100644 --- a/libswscale/x86/swscale_template.c +++ b/libswscale/x86/swscale_template.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -37,8 +37,8 @@ #define YSCALEYUV2YV12X(offset, dest, end, pos) \ __asm__ volatile(\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ ".p2align 4 \n\t" /* FIXME Unroll? */\ @@ -60,8 +60,8 @@ MOVNTQ(%%mm3, (%1, %3))\ "add $8, %3 \n\t"\ "cmp %2, %3 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ + "movq "DITHER16"+0(%0), %%mm3 \n\t"\ + "movq "DITHER16"+8(%0), %%mm4 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ "jb 1b \n\t"\ @@ -75,16 +75,21 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest[4], int dstW, int chrDstW) + uint8_t *dest[4], int dstW, int chrDstW, + const uint8_t *lumDither, const uint8_t *chrDither) { + int i; uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; if (uDest) { x86_reg uv_off = c->uv_off; + for(i=0; i<8; i++) c->dither16[i] = chrDither[i]>>4; YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) + for(i=0; i<8; i++) c->dither16[i] = chrDither[(i+3)&7]>>4; YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } + for(i=0; i<8; i++) c->dither16[i] = lumDither[i]>>4; if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) } @@ -95,6 +100,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \ __asm__ volatile(\ "lea " offset "(%0), %%"REG_d" \n\t"\ + "movq "DITHER32"+0(%0), %%mm4 \n\t"\ + "movq "DITHER32"+8(%0), %%mm5 \n\t"\ + "movq "DITHER32"+16(%0), %%mm6 \n\t"\ + "movq "DITHER32"+24(%0), %%mm7 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\ "pxor %%mm5, %%mm5 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -126,26 +135,21 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, "paddd %%mm2, %%mm6 \n\t"\ "paddd %%mm0, %%mm7 \n\t"\ " jnz 1b \n\t"\ - "psrad $16, %%mm4 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "psrad $19, %%mm4 \n\t"\ + "psrad $19, %%mm5 \n\t"\ + "psrad $19, %%mm6 \n\t"\ + "psrad $19, %%mm7 \n\t"\ "packssdw %%mm5, %%mm4 \n\t"\ "packssdw %%mm7, %%mm6 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm0, %%mm6 \n\t"\ - "psraw $3, %%mm4 \n\t"\ - "psraw $3, %%mm6 \n\t"\ "packuswb %%mm6, %%mm4 \n\t"\ MOVNTQ(%%mm4, (%1, %3))\ "add $8, %3 \n\t"\ "cmp %2, %3 \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ + "movq "DITHER32"+0(%0), %%mm4 \n\t"\ + "movq "DITHER32"+8(%0), %%mm5 \n\t"\ + "movq "DITHER32"+16(%0), %%mm6 \n\t"\ + "movq "DITHER32"+24(%0), %%mm7 \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\ "jb 1b \n\t"\ :: "r" (&c->redDither),\ @@ -158,16 +162,21 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, - uint8_t *dest[4], int dstW, int chrDstW) + uint8_t *dest[4], int dstW, int chrDstW, + const uint8_t *lumDither, const uint8_t *chrDither) { + int i; uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; if (uDest) { x86_reg uv_off = c->uv_off; + for(i=0; i<8; i++) c->dither32[i] = chrDither[i]<<12; YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) + for(i=0; i<8; i++) c->dither32[i] = chrDither[(i+3)&7]<<12; YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) } + for(i=0; i<8; i++) c->dither32[i] = lumDither[i]<<12; if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) } @@ -178,7 +187,8 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrUSrc, const int16_t *chrVSrc, const int16_t *alpSrc, - uint8_t *dst[4], int dstW, int chrDstW) + uint8_t *dst[4], int dstW, int chrDstW, + const uint8_t *lumDither, const uint8_t *chrDither) { int p= 4; const int16_t *src[4]= { @@ -212,7 +222,8 @@ static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrUSrc, const int16_t *chrVSrc, const int16_t *alpSrc, - uint8_t *dst[4], int dstW, int chrDstW) + uint8_t *dst[4], int dstW, int chrDstW, + const uint8_t *lumDither, const uint8_t *chrDither) { int p= 4; const int16_t *src[4]= { @@ -223,16 +234,17 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, while (p--) { if (dst[p]) { + int i; + for(i=0; i<8; i++) c->dither16[i] = i<2 ? lumDither[i] : chrDither[i]; __asm__ volatile( "mov %2, %%"REG_a" \n\t" - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "psllw $6, %%mm7 \n\t" + "movq 0(%3), %%mm6 \n\t" + "movq 8(%3), %%mm7 \n\t" ".p2align 4 \n\t" /* FIXME Unroll? */ "1: \n\t" "movq (%0, %%"REG_a", 2), %%mm0 \n\t" "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t" - "paddsw %%mm7, %%mm0 \n\t" + "paddsw %%mm6, %%mm0 \n\t" "paddsw %%mm7, %%mm1 \n\t" "psraw $7, %%mm0 \n\t" "psraw $7, %%mm1 \n\t" @@ -241,7 +253,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, "add $8, %%"REG_a" \n\t" "jnc 1b \n\t" :: "r" (src[p]), "r" (dst[p] + counter[p]), - "g" (-counter[p]) + "g" (-counter[p]), "r"(c->dither16) : "%"REG_a ); } @@ -999,8 +1011,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], : "%r8" ); #else - *(const uint16_t **)(&c->u_temp)=abuf0; - *(const uint16_t **)(&c->v_temp)=abuf1; + c->u_temp=(intptr_t)abuf0; + c->v_temp=(intptr_t)abuf1; __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" @@ -1625,6 +1637,32 @@ static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, assert(src1 == src2); } +static void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, + const uint8_t *src1, const uint8_t *src2, + int width, uint32_t *unused) +{ + __asm__ volatile( + "mov %0, %%"REG_a" \n\t" + "1: \n\t" + "movq (%1, %%"REG_a",2), %%mm0 \n\t" + "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" + "movq (%2, %%"REG_a",2), %%mm2 \n\t" + "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm1 \n\t" + "psrlw $8, %%mm2 \n\t" + "psrlw $8, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, (%3, %%"REG_a") \n\t" + "movq %%mm2, (%4, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) + : "%"REG_a + ); +} + /* This is almost identical to the previous, end exists only because * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, @@ -1674,6 +1712,33 @@ static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, assert(src1 == src2); } +static void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, + const uint8_t *src1, const uint8_t *src2, + int width, uint32_t *unused) +{ + __asm__ volatile( + "movq "MANGLE(bm01010101)", %%mm4 \n\t" + "mov %0, %%"REG_a" \n\t" + "1: \n\t" + "movq (%1, %%"REG_a",2), %%mm0 \n\t" + "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" + "movq (%2, %%"REG_a",2), %%mm2 \n\t" + "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" + "pand %%mm4, %%mm0 \n\t" + "pand %%mm4, %%mm1 \n\t" + "pand %%mm4, %%mm2 \n\t" + "pand %%mm4, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, (%3, %%"REG_a") \n\t" + "movq %%mm2, (%4, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) + : "%"REG_a + ); +} + static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int width) { @@ -1715,7 +1780,7 @@ static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, } #endif /* !COMPILE_TEMPLATE_MMX2 */ -static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, +static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src, int width, enum PixelFormat srcFormat) { @@ -1756,32 +1821,31 @@ static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *s "paddd %%mm3, %%mm2 \n\t" "paddd %%mm4, %%mm0 \n\t" "paddd %%mm4, %%mm2 \n\t" - "psrad $15, %%mm0 \n\t" - "psrad $15, %%mm2 \n\t" + "psrad $9, %%mm0 \n\t" + "psrad $9, %%mm2 \n\t" "packssdw %%mm2, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, (%1, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" + "movq %%mm0, (%1, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dst+width), "g" ((x86_reg)-width) + : "r" (dst+width), "g" ((x86_reg)-2*width) : "%"REG_a ); } -static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused) { RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); } -static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, +static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, int width, uint32_t *unused) { RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24); } -static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, +static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV, const uint8_t *src, int width, enum PixelFormat srcFormat) { @@ -1823,25 +1887,23 @@ static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, "paddd %%mm3, %%mm2 \n\t" "paddd %%mm3, %%mm1 \n\t" "paddd %%mm3, %%mm4 \n\t" - "psrad $15, %%mm0 \n\t" - "psrad $15, %%mm2 \n\t" - "psrad $15, %%mm1 \n\t" - "psrad $15, %%mm4 \n\t" + "psrad $9, %%mm0 \n\t" + "psrad $9, %%mm2 \n\t" + "psrad $9, %%mm1 \n\t" + "psrad $9, %%mm4 \n\t" "packssdw %%mm1, %%mm0 \n\t" "packssdw %%mm4, %%mm2 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm2, %%mm2 \n\t" - "movd %%mm0, (%1, %%"REG_a") \n\t" - "movd %%mm2, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" + "movq %%mm0, (%1, %%"REG_a") \n\t" + "movq %%mm2, (%2, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" : "+r" (src) - : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) + : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]) : "%"REG_a ); } -static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, +static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { @@ -1849,7 +1911,7 @@ static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, assert(src1 == src2); } -static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, +static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV, const uint8_t *src1, const uint8_t *src2, int width, uint32_t *unused) { @@ -1859,7 +1921,7 @@ static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, #if !COMPILE_TEMPLATE_MMX2 // bilinear / bicubic scaling -static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW, +static void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int16_t *filterPos, int filterSize) { @@ -2015,6 +2077,163 @@ static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW, } #endif /* !COMPILE_TEMPLATE_MMX2 */ +static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc, + const int16_t *filter, const int16_t *filterPos, long filterSize, int shift) +{ + int i, j; + + assert(filterSize % 4 == 0 && filterSize>0); + if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too. + x86_reg counter= -2*dstW; + filter-= counter*2; + filterPos-= counter/2; + dst-= counter/2; + __asm__ volatile( + "movd %5, %%mm7 \n\t" +#if defined(PIC) + "push %%"REG_b" \n\t" +#endif + "push %%"REG_BP" \n\t" // we use 7 regs here ... + "mov %%"REG_a", %%"REG_BP" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movzwl (%2, %%"REG_BP"), %%eax \n\t" + "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" + "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" + "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" + "movq (%3, %%"REG_a", 2), %%mm0 \n\t" + "movq (%3, %%"REG_b", 2), %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpckldq %%mm3, %%mm0 \n\t" + "punpckhdq %%mm3, %%mm4 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "psrad %%mm7, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%"REG_BP") \n\t" + "add $4, %%"REG_BP" \n\t" + " jnc 1b \n\t" + + "pop %%"REG_BP" \n\t" +#if defined(PIC) + "pop %%"REG_b" \n\t" +#endif + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift) +#if !defined(PIC) + : "%"REG_b +#endif + ); + } else if (filterSize==8 && shift<15) { + x86_reg counter= -2*dstW; + filter-= counter*4; + filterPos-= counter/2; + dst-= counter/2; + __asm__ volatile( + "movd %5, %%mm7 \n\t" +#if defined(PIC) + "push %%"REG_b" \n\t" +#endif + "push %%"REG_BP" \n\t" // we use 7 regs here ... + "mov %%"REG_a", %%"REG_BP" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movzwl (%2, %%"REG_BP"), %%eax \n\t" + "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" + "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" + "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" + "movq (%3, %%"REG_a", 2), %%mm0 \n\t" + "movq (%3, %%"REG_b", 2), %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + + "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" + "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" + "movq 8(%3, %%"REG_a", 2), %%mm4 \n\t" + "movq 8(%3, %%"REG_b", 2), %%mm2 \n\t" + "pmaddwd %%mm1, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "paddd %%mm5, %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpckldq %%mm3, %%mm0 \n\t" + "punpckhdq %%mm3, %%mm4 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "psrad %%mm7, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%"REG_BP") \n\t" + "add $4, %%"REG_BP" \n\t" + " jnc 1b \n\t" + + "pop %%"REG_BP" \n\t" +#if defined(PIC) + "pop %%"REG_b" \n\t" +#endif + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift) +#if !defined(PIC) + : "%"REG_b +#endif + ); + } else if (shift<15){ + const uint16_t *offset = src+filterSize; + x86_reg counter= -2*dstW; + //filter-= counter*filterSize/2; + filterPos-= counter/2; + dst-= counter/2; + __asm__ volatile( + "movd %7, %%mm7 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "mov %2, %%"REG_c" \n\t" + "movzwl (%%"REG_c", %0), %%eax \n\t" + "movzwl 2(%%"REG_c", %0), %%edx \n\t" + "mov %5, %%"REG_c" \n\t" + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" + "2: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %6), %%mm3 \n\t" + "movq (%%"REG_c", %%"REG_a", 2), %%mm0 \n\t" + "movq (%%"REG_c", %%"REG_d", 2), %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "paddd %%mm3, %%mm5 \n\t" + "paddd %%mm0, %%mm4 \n\t" + "add $8, %1 \n\t" + "add $8, %%"REG_c" \n\t" + "cmp %4, %%"REG_c" \n\t" + " jb 2b \n\t" + "add %6, %1 \n\t" + "movq %%mm4, %%mm0 \n\t" + "punpckldq %%mm5, %%mm4 \n\t" + "punpckhdq %%mm5, %%mm0 \n\t" + "paddd %%mm0, %%mm4 \n\t" + "psrad %%mm7, %%mm4 \n\t" + "packssdw %%mm4, %%mm4 \n\t" + "mov %3, %%"REG_a" \n\t" + "movd %%mm4, (%%"REG_a", %0) \n\t" + "add $4, %0 \n\t" + " jnc 1b \n\t" + + : "+r" (counter), "+r" (filter) + : "m" (filterPos), "m" (dst), "m"(offset), + "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift) + : "%"REG_a, "%"REG_c, "%"REG_d + ); + } else + for (i=0; i<dstW; i++) { + int srcPos= filterPos[i]; + int val=0; + for (j=0; j<filterSize; j++) { + val += ((int)src[srcPos + j])*filter[filterSize*i + j]; + } + dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ... + } +} + + #if COMPILE_TEMPLATE_MMX2 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, @@ -2156,9 +2375,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) enum PixelFormat srcFormat = c->srcFormat, dstFormat = c->dstFormat; - if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && - dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) { - if (!(c->flags & SWS_BITEXACT)) { + if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12 + && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { if (c->flags & SWS_ACCURATE_RND) { c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); c->yuv2yuvX = RENAME(yuv2yuvX_ar ); @@ -2173,7 +2391,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } else { - c->yuv2yuv1 = RENAME(yuv2yuv1 ); + int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat); + c->yuv2yuv1 = should_dither ? RENAME(yuv2yuv1_ar ) : RENAME(yuv2yuv1 ); c->yuv2yuvX = RENAME(yuv2yuvX ); if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { @@ -2186,7 +2405,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } } - } if (!(c->flags & SWS_FULL_CHR_H_INT)) { switch (c->dstFormat) { case PIX_FMT_RGB32: @@ -2215,7 +2433,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } - if (c->scalingBpp == 8) { #if !COMPILE_TEMPLATE_MMX2 c->hScale = RENAME(hScale ); #endif /* !COMPILE_TEMPLATE_MMX2 */ @@ -2233,7 +2450,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) #if COMPILE_TEMPLATE_MMX2 } #endif /* COMPILE_TEMPLATE_MMX2 */ - } #if !COMPILE_TEMPLATE_MMX2 switch(srcFormat) { @@ -2241,7 +2457,13 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break; case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break; case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break; - default: break; + case PIX_FMT_GRAY16LE : + case PIX_FMT_YUV420P9LE: + case PIX_FMT_YUV422P10LE: + case PIX_FMT_YUV420P10LE: + case PIX_FMT_YUV420P16LE: + case PIX_FMT_YUV422P16LE: + case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break; } #endif /* !COMPILE_TEMPLATE_MMX2 */ if (!c->chrSrcHSubSample) { @@ -2255,8 +2477,10 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) switch (srcFormat) { #if !COMPILE_TEMPLATE_MMX2 case PIX_FMT_YUYV422 : - case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break; - case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break; + case PIX_FMT_Y400A : + c->lumToYV12 = RENAME(yuy2ToY); break; + case PIX_FMT_UYVY422 : + c->lumToYV12 = RENAME(uyvyToY); break; #endif /* !COMPILE_TEMPLATE_MMX2 */ case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break; case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break; @@ -2270,4 +2494,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) } } #endif /* !COMPILE_TEMPLATE_MMX2 */ + if(isAnyRGB(c->srcFormat)) + c->hScale16= RENAME(hScale16); } |