summaryrefslogtreecommitdiff
path: root/libswscale/x86/swscale_template.c
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2011-07-05 12:49:11 -0700
committerRonald S. Bultje <rsbultje@gmail.com>2011-07-08 15:15:30 -0700
commit4e3e333a79272944b40695166438359b376d7864 (patch)
treee7e89603def69ffe3372e2257c6f6c8f573614ab /libswscale/x86/swscale_template.c
parent7d7bacf0f1f43f5cc112977fc3b8438e3e4a4a9b (diff)
swscale: error dithering for 16/9/10-bit to 8-bit.
Based on a somewhat similar idea in FFmpeg's swscale copy.
Diffstat (limited to 'libswscale/x86/swscale_template.c')
-rw-r--r--libswscale/x86/swscale_template.c135
1 files changed, 118 insertions, 17 deletions
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 26cd2742a3..fd6ec3a793 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -37,8 +37,8 @@
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
__asm__ volatile(\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
+ "movq "DITHER16"+0(%0), %%mm3 \n\t"\
+ "movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
@@ -60,8 +60,8 @@
MOVNTQ(%%mm3, (%1, %3))\
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
- "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
- "movq %%mm3, %%mm4 \n\t"\
+ "movq "DITHER16"+0(%0), %%mm3 \n\t"\
+ "movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
@@ -70,6 +70,42 @@
: "%"REG_d, "%"REG_S\
);
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "psrlq $24, %%mm3\n\t"
+ "psllq $40, %%mm4\n\t"
+ "por %%mm4, %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ "psraw $4, %%mm3\n\t"
+ "psraw $4, %%mm4\n\t"
+ "movq %%mm3, "DITHER16"+0(%1)\n\t"
+ "movq %%mm4, "DITHER16"+8(%1)\n\t"
+ :: "r"(srcDither), "r"(&c->redDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm3\n\t"
+ "movq %%mm3, %%mm4\n\t"
+ "punpcklbw %%mm0, %%mm3\n\t"
+ "punpckhbw %%mm0, %%mm4\n\t"
+ "psraw $4, %%mm3\n\t"
+ "psraw $4, %%mm4\n\t"
+ "movq %%mm3, "DITHER16"+0(%1)\n\t"
+ "movq %%mm4, "DITHER16"+8(%1)\n\t"
+ :: "r"(srcDither), "r"(&c->redDither)
+ );
+ }
+}
+#endif
+
static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
@@ -79,12 +115,16 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
{
uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
*aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+ const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
if (uDest) {
x86_reg uv_off = c->uv_offx2 >> 1;
+ dither_8to16(c, chrDither, 0);
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+ dither_8to16(c, chrDither, 1);
YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
}
+ dither_8to16(c, lumDither, 0);
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
}
@@ -95,10 +135,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
__asm__ volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
+ "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+ "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+ "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+ "movq "DITHER32"+24(%0), %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
@@ -142,10 +182,10 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
- "pxor %%mm4, %%mm4 \n\t"\
- "pxor %%mm5, %%mm5 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pxor %%mm7, %%mm7 \n\t"\
+ "movq "DITHER32"+0(%0), %%mm4 \n\t"\
+ "movq "DITHER32"+8(%0), %%mm5 \n\t"\
+ "movq "DITHER32"+16(%0), %%mm6 \n\t"\
+ "movq "DITHER32"+24(%0), %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
@@ -153,6 +193,62 @@ static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
: "%"REG_a, "%"REG_d, "%"REG_S\
);
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot)
+{
+ if (rot) {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm4\n\t"
+ "movq %%mm4, %%mm5\n\t"
+ "psrlq $24, %%mm4\n\t"
+ "psllq $40, %%mm5\n\t"
+ "por %%mm5, %%mm4\n\t"
+ "movq %%mm4, %%mm6\n\t"
+ "punpcklbw %%mm0, %%mm4\n\t"
+ "punpckhbw %%mm0, %%mm6\n\t"
+ "movq %%mm4, %%mm5\n\t"
+ "movq %%mm6, %%mm7\n\t"
+ "punpcklwd %%mm0, %%mm4\n\t"
+ "punpckhwd %%mm0, %%mm5\n\t"
+ "punpcklwd %%mm0, %%mm6\n\t"
+ "punpckhwd %%mm0, %%mm7\n\t"
+ "psllw $12, %%mm4\n\t"
+ "psllw $12, %%mm5\n\t"
+ "psllw $12, %%mm6\n\t"
+ "psllw $12, %%mm7\n\t"
+ "movq %%mm3, "DITHER32"+0(%1)\n\t"
+ "movq %%mm4, "DITHER32"+8(%1)\n\t"
+ "movq %%mm4, "DITHER32"+16(%1)\n\t"
+ "movq %%mm4, "DITHER32"+24(%1)\n\t"
+ :: "r"(srcDither), "r"(&c->redDither)
+ );
+ } else {
+ __asm__ volatile("pxor %%mm0, %%mm0\n\t"
+ "movq (%0), %%mm4\n\t"
+ "movq %%mm4, %%mm6\n\t"
+ "punpcklbw %%mm0, %%mm4\n\t"
+ "punpckhbw %%mm0, %%mm6\n\t"
+ "movq %%mm4, %%mm5\n\t"
+ "movq %%mm6, %%mm7\n\t"
+ "punpcklwd %%mm0, %%mm4\n\t"
+ "punpckhwd %%mm0, %%mm5\n\t"
+ "punpcklwd %%mm0, %%mm6\n\t"
+ "punpckhwd %%mm0, %%mm7\n\t"
+ "psllw $12, %%mm4\n\t"
+ "psllw $12, %%mm5\n\t"
+ "psllw $12, %%mm6\n\t"
+ "psllw $12, %%mm7\n\t"
+ "movq %%mm3, "DITHER32"+0(%1)\n\t"
+ "movq %%mm4, "DITHER32"+8(%1)\n\t"
+ "movq %%mm4, "DITHER32"+16(%1)\n\t"
+ "movq %%mm4, "DITHER32"+24(%1)\n\t"
+ :: "r"(srcDither), "r"(&c->redDither)
+ );
+ }
+}
+#endif
+
static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
@@ -162,12 +258,16 @@ static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
{
uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
*aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+ const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
if (uDest) {
x86_reg uv_off = c->uv_offx2 >> 1;
+ dither_8to32(c, chrDither, 0);
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
+ dither_8to32(c, chrDither, 1);
YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off)
}
+ dither_8to32(c, lumDither, 0);
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
}
@@ -220,19 +320,20 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
chrVSrc + chrDstW, alpSrc + dstW
};
x86_reg counter[4]= { dstW, chrDstW, chrDstW, dstW };
+ const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
while (p--) {
if (dst[p]) {
+ dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
__asm__ volatile(
"mov %2, %%"REG_a" \n\t"
- "pcmpeqw %%mm7, %%mm7 \n\t"
- "psrlw $15, %%mm7 \n\t"
- "psllw $6, %%mm7 \n\t"
+ "movq "DITHER16"+0(%3), %%mm6 \n\t"
+ "movq "DITHER16"+8(%3), %%mm7 \n\t"
".p2align 4 \n\t" /* FIXME Unroll? */
"1: \n\t"
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"
"movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"
- "paddsw %%mm7, %%mm0 \n\t"
+ "paddsw %%mm6, %%mm0 \n\t"
"paddsw %%mm7, %%mm1 \n\t"
"psraw $7, %%mm0 \n\t"
"psraw $7, %%mm1 \n\t"
@@ -241,7 +342,7 @@ static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
"add $8, %%"REG_a" \n\t"
"jnc 1b \n\t"
:: "r" (src[p]), "r" (dst[p] + counter[p]),
- "g" (-counter[p])
+ "g" (-counter[p]), "r"(&c->redDither)
: "%"REG_a
);
}