Merge remote-tracking branch 'qatar/master'

* qatar/master: swscale: add dithering to yuv2yuvX_altivec_real rv34: free+allocate buffer instead of reallocating it to preserve alignment h264: add missing brackets. swscale: use 15-bit intermediates for 9/10-bit scaling. Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2011-08-13 22:23:40 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2011-08-13 22:24:47 +0200
commit: ca1dfea12771b585846fb86aa08c3d7f066a3cc4 (patch)
tree: 6db034b36245b9fe51d9a41d41be1718a3be8b89 /libswscale
parent: 75af0e6a1601a4246d6409ca28dc80a3ba0e8d6e (diff)
parent: 3304a1e69a8a050eb66d2304acd2d01354fa1aac (diff)
5 files changed, 246 insertions, 170 deletions
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 8bc0ddd9d8..8a5bac308e 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -92,6 +92,7 @@ altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
     }
 }
 
+//FIXME remove the usage of scratch buffers.
 static void
 yuv2yuvX_altivec_real(SwsContext *c,
                       const int16_t *lumFilter, const int16_t **lumSrc,
@@ -101,17 +102,13 @@ yuv2yuvX_altivec_real(SwsContext *c,
                       uint8_t *dest[4], int dstW, int chrDstW)
 {
     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2];
-    const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
     register int i, j;
     {
         DECLARE_ALIGNED(16, int, val)[dstW];
 
-        for (i = 0; i < (dstW -7); i+=4) {
-            vec_st(vini, i << 2, val);
-        }
-        for (; i < dstW; i++) {
-            val[i] = (1 << 18);
-        }
+        for (i=0; i<dstW; i++)
+            val[i] = lumDither[i & 7] << 12;
 
         for (j = 0; j < lumFilterSize; j++) {
             vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter);
@@ -155,13 +152,9 @@ yuv2yuvX_altivec_real(SwsContext *c,
         DECLARE_ALIGNED(16, int, u)[chrDstW];
         DECLARE_ALIGNED(16, int, v)[chrDstW];
 
-        for (i = 0; i < (chrDstW -7); i+=4) {
-            vec_st(vini, i << 2, u);
-            vec_st(vini, i << 2, v);
-        }
-        for (; i < chrDstW; i++) {
-            u[i] = (1 << 18);
-            v[i] = (1 << 18);
+        for (i=0; i<chrDstW; i++) {
+            u[i] = chrDither[i & 7] << 12;
+            v[i] = chrDither[(i + 3) & 7] << 12;
         }
 
         for (j = 0; j < chrFilterSize; j++) {
@@ -406,7 +399,7 @@ void ff_sws_init_swScale_altivec(SwsContext *c)
     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
         return;
 
-    if (c->scalingBpp == 8) {
+    if (c->srcBpc == 8 && c->dstBpc <= 10) {
         c->hScale       = hScale_altivec_real;
     }
     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 9d17868019..9897f2fd89 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -307,17 +307,9 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 
 #define output_pixel(pos, val) \
     if (big_endian) { \
-        if (output_bits == 16) { \
-            AV_WB16(pos, av_clip_uint16(val >> shift)); \
-        } else { \
-            AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
-        } \
+        AV_WB16(pos, av_clip_uint16(val >> shift)); \
     } else { \
-        if (output_bits == 16) { \
-            AV_WL16(pos, av_clip_uint16(val >> shift)); \
-        } else { \
-            AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
-        } \
+        AV_WL16(pos, av_clip_uint16(val >> shift)); \
     }
     for (i = 0; i < dstW; i++) {
         int val = 1 << (26-output_bits + 4*dword - 1);
@@ -359,7 +351,67 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 #undef output_pixel
 }
 
-#define yuv2NBPS(bits, BE_LE, is_be) \
+static av_always_inline void
+yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
+                      int lumFilterSize, const int16_t *chrFilter,
+                      const int16_t **chrUSrc, const int16_t **chrVSrc,
+                      int chrFilterSize, const int16_t **alpSrc,
+                      uint16_t *dest[4], int dstW, int chrDstW,
+                      int big_endian, int output_bits)
+{
+    //FIXME Optimize (just quickly written not optimized..)
+    int i;
+    uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
+             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
+    int shift = 11 + 16 - output_bits - 1;
+
+#define output_pixel(pos, val) \
+    if (big_endian) { \
+        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+    } else { \
+        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+    }
+    for (i = 0; i < dstW; i++) {
+        int val = 1 << (26-output_bits - 1);
+        int j;
+
+        for (j = 0; j < lumFilterSize; j++)
+            val += (lumSrc[j][i] * lumFilter[j]) >> 1;
+
+        output_pixel(&yDest[i], val);
+    }
+
+    if (uDest) {
+        for (i = 0; i < chrDstW; i++) {
+            int u = 1 << (26-output_bits - 1);
+            int v = 1 << (26-output_bits - 1);
+            int j;
+
+            for (j = 0; j < chrFilterSize; j++) {
+                u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
+                v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
+            }
+
+            output_pixel(&uDest[i], u);
+            output_pixel(&vDest[i], v);
+        }
+    }
+
+    if (CONFIG_SWSCALE_ALPHA && aDest) {
+        for (i = 0; i < dstW; i++) {
+            int val = 1 << (26-output_bits - 1);
+            int j;
+
+            for (j = 0; j < lumFilterSize; j++)
+                val += (alpSrc[j][i] * lumFilter[j]) >> 1;
+
+            output_pixel(&aDest[i], val);
+        }
+    }
+#undef output_pixel
+}
+
+#define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
                               const int16_t **_lumSrc, int lumFilterSize, \
                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
@@ -367,21 +419,21 @@ static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFil
                               int chrFilterSize, const int16_t **_alpSrc, \
                               uint8_t *_dest[4], int dstW, int chrDstW) \
 { \
-    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
-                  **chrUSrc = (const int32_t **) _chrUSrc, \
-                  **chrVSrc = (const int32_t **) _chrVSrc, \
-                  **alpSrc  = (const int32_t **) _alpSrc; \
-    yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
-                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                          alpSrc, (uint16_t **) _dest, \
-                          dstW, chrDstW, is_be, bits); \
-}
-yuv2NBPS( 9, BE, 1);
-yuv2NBPS( 9, LE, 0);
-yuv2NBPS(10, BE, 1);
-yuv2NBPS(10, LE, 0);
-yuv2NBPS(16, BE, 1);
-yuv2NBPS(16, LE, 0);
+    const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
+                  **chrUSrc = (const typeX_t **) _chrUSrc, \
+                  **chrVSrc = (const typeX_t **) _chrVSrc, \
+                  **alpSrc  = (const typeX_t **) _alpSrc; \
+    yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
+                         chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+                         alpSrc, (uint16_t **) _dest, \
+                         dstW, chrDstW, is_be, bits); \
+}
+yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
+yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
+yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t);
+yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
+yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
+yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 
 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
                        const int16_t **lumSrc, int lumFilterSize,
@@ -1971,15 +2023,15 @@ static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
     }
 }
 
-static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
-                       const int16_t *filter,
-                       const int16_t *filterPos, int filterSize)
+static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
+                           const int16_t *filter,
+                           const int16_t *filterPos, int filterSize)
 {
     int i;
     int32_t *dst = (int32_t *) _dst;
     const uint16_t *src = (const uint16_t *) _src;
     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
-    int sh = (bits <= 7) ? 11 : (bits - 4);
+    int sh = bits - 4;
 
     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
         sh= 9;
@@ -1997,10 +2049,31 @@ static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_s
     }
 }
 
+static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
+                           const int16_t *filter,
+                           const int16_t *filterPos, int filterSize)
+{
+    int i;
+    const uint16_t *src = (const uint16_t *) _src;
+    int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
+    for (i = 0; i < dstW; i++) {
+        int j;
+        int srcPos = filterPos[i];
+        int val = 0;
+
+        for (j = 0; j < filterSize; j++) {
+            val += src[srcPos + j] * filter[filterSize * i + j];
+        }
+        // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
+        dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
+    }
+}
+
 // bilinear / bicubic scaling
-static void hScale_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
-                     const int16_t *filter, const int16_t *filterPos,
-                     int filterSize)
+static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
+                          const int16_t *filter, const int16_t *filterPos,
+                          int filterSize)
 {
     int i;
     for (i=0; i<dstW; i++) {
@@ -2045,6 +2118,25 @@ static inline void hScale16NX_c(int16_t *dst, int dstW, const uint16_t *src, int
     }
 }
 
+static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
+                          const int16_t *filter, const int16_t *filterPos,
+                          int filterSize)
+{
+    int i;
+    int32_t *dst = (int32_t *) _dst;
+    for (i=0; i<dstW; i++) {
+        int j;
+        int srcPos= filterPos[i];
+        int val=0;
+        for (j=0; j<filterSize; j++) {
+            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
+        }
+        //filter += hFilterSize;
+        dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
+        //dst[i] = val>>7;
+    }
+}
+
 //FIXME all pal and rgb srcFormats could do this convertion as well
 //FIXME all scalers more complex than bilinear could do half of this transform
 static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
@@ -2126,23 +2218,6 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
         dst[i] = src[srcW-1]*128;
 }
 
-static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len)
-{
-    int i;
-    uint8_t *dst = (uint8_t *) _dst;
-    for (i = len - 1; i >= 0; i--) {
-        dst[i * 2] = dst[i * 2 + 1] = src[i];
-    }
-}
-
-static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len)
-{
-    int i;
-    for (i = 0; i < len; i++) {
-        dst[i] = src[i] >> 4;
-    }
-}
-
 // *** horizontal scale Y line to temp buffer
 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
                                      const uint8_t *src, int srcW, int xInc,
@@ -2159,11 +2234,6 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
         src= formatConvBuffer;
     }
 
-    if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) {
-        c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW);
-        src = formatConvBuffer;
-    }
-
     if (c->hScale16) {
         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
@@ -2175,10 +2245,6 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
 
     if (convertRange)
         convertRange(dst, dstWidth);
-
-    if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) {
-        c->scale19To15Fw(dst, (int32_t *) dst, dstWidth);
-    }
 }
 
 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
@@ -2213,14 +2279,6 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2
         src2= buf2;
     }
 
-    if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) {
-        uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2+78, 16));
-        c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW);
-        c->scale8To16Rv((uint16_t *) buf2,             src2, srcW);
-        src1 = formatConvBuffer;
-        src2 = buf2;
-    }
-
     if (c->hScale16) {
         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
@@ -2234,11 +2292,6 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2
 
     if (c->chrConvertRange)
         c->chrConvertRange(dst1, dst2, dstWidth);
-
-    if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) {
-        c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth);
-        c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth);
-    }
 }
 
 static av_always_inline void
@@ -2775,12 +2828,12 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         case PIX_FMT_YUV420P9BE:
         case PIX_FMT_YUV444P10BE:
         case PIX_FMT_YUV422P10BE:
-        case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? hScale16N_c : hScale16NX_c; break;
+        case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? NULL : hScale16NX_c; break;
         case PIX_FMT_YUV444P9LE:
         case PIX_FMT_YUV420P9LE:
         case PIX_FMT_YUV422P10LE:
         case PIX_FMT_YUV420P10LE:
-        case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : hScale16N_c; break;
+        case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : NULL; break;
 #if HAVE_BIGENDIAN
         case PIX_FMT_YUV420P16LE:
         case PIX_FMT_YUV422P16LE:
@@ -2889,37 +2942,45 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
         }
     }
 
-    if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
-       || c->srcFormat == PIX_FMT_PAL8)
-        c->hScale16= hScale16N_c;
-
-    if (c->scalingBpp == 8) {
-    c->hScale       = hScale_c;
-    if (c->flags & SWS_FAST_BILINEAR) {
-        c->hyscale_fast = hyscale_fast_c;
-        c->hcscale_fast = hcscale_fast_c;
-    }
 
-    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
-        if (c->srcRange) {
-            c->lumConvertRange = lumRangeFromJpeg_c;
-            c->chrConvertRange = chrRangeFromJpeg_c;
+    if (c->srcBpc == 8) {
+        if (c->dstBpc <= 10) {
+            if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+            || c->srcFormat == PIX_FMT_PAL8)
+                c->hScale16= hScale16N_c;
+            c->hScale       = hScale8To15_c;
+            if (c->flags & SWS_FAST_BILINEAR) {
+                c->hyscale_fast = hyscale_fast_c;
+                c->hcscale_fast = hcscale_fast_c;
+            }
         } else {
-            c->lumConvertRange = lumRangeToJpeg_c;
-            c->chrConvertRange = chrRangeToJpeg_c;
+            c->hScale = hScale8To19_c;
+            av_assert0(c->hScale16 != hScale16N_c && c->hScale16 != hScale16NX_c);
         }
-    }
     } else {
-        if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){
-            c->chrToYV12 = bswap16UV_c;
-            c->lumToYV12 = bswap16Y_c;
+        if(c->dstBpc > 10){
+            if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+            || c->srcFormat == PIX_FMT_PAL8)
+                c->hScale16= hScale16N_c;
+            if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){
+                c->chrToYV12 = bswap16UV_c;
+                c->lumToYV12 = bswap16Y_c;
+            }
+            c->hScale16 = NULL;
         }
-        c->hScale16 = NULL;
-        c->hScale = hScale16_c;
-        c->scale19To15Fw = scale19To15Fw_c;
-        c->scale8To16Rv  = scale8To16Rv_c;
+        c->hScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
+    }
 
-        if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
+        if (c->dstBpc <= 10) {
+            if (c->srcRange) {
+                c->lumConvertRange = lumRangeFromJpeg_c;
+                c->chrConvertRange = chrRangeFromJpeg_c;
+            } else {
+                c->lumConvertRange = lumRangeToJpeg_c;
+                c->chrConvertRange = chrRangeToJpeg_c;
+            }
+        } else {
             if (c->srcRange) {
                 c->lumConvertRange = lumRangeFromJpeg16_c;
                 c->chrConvertRange = chrRangeFromJpeg16_c;
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 30dec99130..a936f96373 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -66,11 +66,16 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[],
  * without any additional vertical scaling (or point-scaling).
  *
  * @param c       SWS scaling context
- * @param lumSrc  scaled luma (Y) source data, 15bit for 8bit output
- * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output
- * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output
- * @param alpSrc  scaled alpha (A) source data, 15bit for 8bit output
- * @param dest    pointer to the 4 output planes (Y/U/V/A)
+ * @param lumSrc  scaled luma (Y) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param alpSrc  scaled alpha (A) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param dest    pointer to the 4 output planes (Y/U/V/A). For >8bit
+ *                output, this is in uint16_t
  * @param dstW    width of dest[0], dest[3], lumSrc and alpSrc in pixels
  * @param chrDstW width of dest[1], dest[2], chrUSrc and chrVSrc
  */
@@ -84,14 +89,19 @@ typedef void (*yuv2planar1_fn) (struct SwsContext *c,
  *
  * @param c             SWS scaling context
  * @param lumFilter     vertical luma/alpha scaling coefficients, 12bit [0,4096]
- * @param lumSrc        scaled luma (Y) source data, 15bit for 8bit output
+ * @param lumSrc        scaled luma (Y) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
  * @param lumFilterSize number of vertical luma/alpha input lines to scale
  * @param chrFilter     vertical chroma scaling coefficients, 12bit [0,4096]
- * @param chrUSrc       scaled chroma (U) source data, 15bit for 8bit output
- * @param chrVSrc       scaled chroma (V) source data, 15bit for 8bit output
+ * @param chrUSrc       scaled chroma (U) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param chrVSrc       scaled chroma (V) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
  * @param chrFilterSize number of vertical chroma input lines to scale
- * @param alpSrc        scaled alpha (A) source data, 15bit for 8bit output
- * @param dest          pointer to the 4 output planes (Y/U/V/A)
+ * @param alpSrc        scaled alpha (A) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param dest          pointer to the 4 output planes (Y/U/V/A). For >8bit
+ *                      output, this is in uint16_t
  * @param dstW          width of dest[0], dest[3], lumSrc and alpSrc in pixels
  * @param chrDstW       width of dest[1], dest[2], chrUSrc and chrVSrc
  */
@@ -107,11 +117,16 @@ typedef void (*yuv2planarX_fn) (struct SwsContext *c, const int16_t *lumFilter,
  * that this function may do chroma scaling, see the "uvalpha" argument.
  *
  * @param c       SWS scaling context
- * @param lumSrc  scaled luma (Y) source data, 15bit for 8bit output
- * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output
- * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output
- * @param alpSrc  scaled alpha (A) source data, 15bit for 8bit output
- * @param dest    pointer to the output plane
+ * @param lumSrc  scaled luma (Y) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param alpSrc  scaled alpha (A) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param dest    pointer to the output plane. For 16bit output, this is
+ *                uint16_t
  * @param dstW    width of lumSrc and alpSrc in pixels, number of pixels
  *                to write into dest[]
  * @param uvalpha chroma scaling coefficient for the second line of chroma
@@ -134,11 +149,16 @@ typedef void (*yuv2packed1_fn) (struct SwsContext *c,  const int16_t *lumSrc,
  * output by doing bilinear scaling between two input lines.
  *
  * @param c       SWS scaling context
- * @param lumSrc  scaled luma (Y) source data, 15bit for 8bit output
- * @param chrUSrc scaled chroma (U) source data, 15bit for 8bit output
- * @param chrVSrc scaled chroma (V) source data, 15bit for 8bit output
- * @param alpSrc  scaled alpha (A) source data, 15bit for 8bit output
- * @param dest    pointer to the output plane
+ * @param lumSrc  scaled luma (Y) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param alpSrc  scaled alpha (A) source data, 15bit for 8-10bit output,
+ *                19-bit for 16bit output (in int32_t)
+ * @param dest    pointer to the output plane. For 16bit output, this is
+ *                uint16_t
  * @param dstW    width of lumSrc and alpSrc in pixels, number of pixels
  *                to write into dest[]
  * @param yalpha  luma/alpha scaling coefficients for the second input line.
@@ -162,14 +182,19 @@ typedef void (*yuv2packed2_fn) (struct SwsContext *c,  const int16_t *lumSrc[2],
  *
  * @param c             SWS scaling context
  * @param lumFilter     vertical luma/alpha scaling coefficients, 12bit [0,4096]
- * @param lumSrc        scaled luma (Y) source data, 15bit for 8bit output
+ * @param lumSrc        scaled luma (Y) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
  * @param lumFilterSize number of vertical luma/alpha input lines to scale
  * @param chrFilter     vertical chroma scaling coefficients, 12bit [0,4096]
- * @param chrUSrc       scaled chroma (U) source data, 15bit for 8bit output
- * @param chrVSrc       scaled chroma (V) source data, 15bit for 8bit output
+ * @param chrUSrc       scaled chroma (U) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param chrVSrc       scaled chroma (V) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
  * @param chrFilterSize number of vertical chroma input lines to scale
- * @param alpSrc        scaled alpha (A) source data, 15bit for 8bit output
- * @param dest          pointer to the output plane
+ * @param alpSrc        scaled alpha (A) source data, 15bit for 8-10bit output,
+ *                      19-bit for 16bit output (in int32_t)
+ * @param dest          pointer to the output plane. For 16bit output, this is
+ *                      uint16_t
  * @param dstW          width of lumSrc and alpSrc in pixels, number of pixels
  *                      to write into dest[]
  * @param y             vertical line number for this output. This does not need
@@ -209,7 +234,7 @@ typedef struct SwsContext {
     enum PixelFormat srcFormat;   ///< Source      pixel format.
     int dstFormatBpp;             ///< Number of bits per pixel of the destination pixel format.
     int srcFormatBpp;             ///< Number of bits per pixel of the source      pixel format.
-    int scalingBpp;
+    int dstBpc, srcBpc;
     int chrSrcHSubSample;         ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source      image.
     int chrSrcVSubSample;         ///< Binary logarithm of vertical   subsampling factor between luma/alpha and chroma planes in source      image.
     int chrDstHSubSample;         ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
@@ -433,17 +458,19 @@ typedef struct SwsContext {
      * lines, to produce one (differently sized) line of output data.
      *
      * @param dst        pointer to destination buffer for horizontally scaled
-     *                   data. If the scaling depth (SwsContext->scalingBpp) is
-     *                   8, data will be 15bpp in 16bits (int16_t) width. If
-     *                   scaling depth is 16, data will be 19bpp in 32bpp
-     *                   (int32_t) width.
+     *                   data. If the number of bits per component of one
+     *                   destination pixel (SwsContext->dstBpc) is <= 10, data
+     *                   will be 15bpc in 16bits (int16_t) width. Else (i.e.
+     *                   SwsContext->dstBpc == 16), data will be 19bpc in
+     *                   32bits (int32_t) width.
      * @param dstW       width of destination image
-     * @param src        pointer to source data to be scaled. If scaling depth
-     *                   is 8, this is 8bpp in 8bpp (uint8_t) width. If scaling
-     *                   depth is 16, this is native depth in 16bbp (uint16_t)
-     *                   width. In other words, for 9-bit YUV input, this is
-     *                   9bpp, for 10-bit YUV input, this is 10bpp, and for
-     *                   16-bit RGB or YUV, this is 16bpp.
+     * @param src        pointer to source data to be scaled. If the number of
+     *                   bits per component of a source pixel (SwsContext->srcBpc)
+     *                   is 8, this is 8bpc in 8bits (uint8_t) width. Else
+     *                   (i.e. SwsContext->dstBpc > 8), this is native depth
+     *                   in 16bits (uint16_t) width. In other words, for 9-bit
+     *                   YUV input, this is 9bpc, for 10-bit YUV input, this is
+     *                   10bpc, and for 16-bit RGB or YUV, this is 16bpc.
      * @param filter     filter coefficients to be used per output pixel for
      *                   scaling. This contains 14bpp filtering coefficients.
      *                   Guaranteed to contain dstW * filterSize entries.
@@ -467,15 +494,6 @@ typedef struct SwsContext {
     void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed.
     void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); ///< Color range conversion function for chroma planes if needed.
 
-    /**
-     * dst[..] = (src[..] << 8) | src[..];
-     */
-    void (*scale8To16Rv)(uint16_t *dst, const uint8_t *src, int len);
-    /**
-     * dst[..] = src[..] >> 4;
-     */
-    void (*scale19To15Fw)(int16_t *dst, const int32_t *src, int len);
-
     int needs_hcscale; ///< Set if there are chroma planes to be converted.
 
 } SwsContext;
diff --git a/libswscale/utils.c b/libswscale/utils.c
index d3451e4bbd..54a4c0d42f 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -842,14 +842,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         }
     }
 
-    c->scalingBpp = FFMAX(av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1,
-                          av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1) >= 15 ? 16 : 8;
-
-    if (c->scalingBpp == 16)
+    c->srcBpc = 1 + av_pix_fmt_descriptors[srcFormat].comp[0].depth_minus1;
+    if (c->srcBpc < 8)
+        c->srcBpc = 8;
+    c->dstBpc = 1 + av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1;
+    if (c->dstBpc < 8)
+        c->dstBpc = 8;
+    if (isAnyRGB(srcFormat) && c->dstBpc == 16)
+        c->srcBpc = 16;
+    if (c->dstBpc == 16)
         dst_stride <<= 1;
-    av_assert0(c->scalingBpp<=16);
     FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
-    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->scalingBpp == 8) {
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 && c->srcBpc == 8 && c->dstBpc <= 10) {
         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
             if (flags&SWS_PRINT_INFO)
@@ -875,7 +879,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrXInc+= 20;
         }
         //we don't use the x86 asm scaler if MMX is available
-        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->scalingBpp == 8) {
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->dstBpc <= 10) {
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
         }
@@ -1007,7 +1011,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize];
     }
     // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
-    c->uv_off   = (dst_stride>>1) + 64 / c->scalingBpp;
+    c->uv_off   = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
     c->uv_offx2 = dst_stride + 16;
     for (i=0; i<c->vChrBufSize; i++) {
         FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+32, fail);
@@ -1023,7 +1027,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     //try to avoid drawing green stuff between the right end and the stride end
     for (i=0; i<c->vChrBufSize; i++)
         if(av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 == 15){
-            av_assert0(c->scalingBpp == 16);
+            av_assert0(c->dstBpc > 10);
             for(j=0; j<dst_stride/2+1; j++)
                 ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
         } else
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 87248e76fa..4163647fa0 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -2472,7 +2472,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         }
     }
 
-    if (c->scalingBpp == 8) {
+    if (c->srcBpc == 8 && c->dstBpc <= 10) {
 #if !COMPILE_TEMPLATE_MMX2
     c->hScale       = RENAME(hScale      );
 #endif /* !COMPILE_TEMPLATE_MMX2 */
@@ -2532,6 +2532,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 #endif /* !COMPILE_TEMPLATE_MMX2 */
     if(isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
         c->hScale16= RENAME(hScale16);
-    if(c->scalingBpp != 8)
+    if(c->dstBpc > 10)
         c->hScale16 = NULL;
 }
author	Michael Niedermayer <michaelni@gmx.at>	2011-08-13 22:23:40 +0200
committer	Michael Niedermayer <michaelni@gmx.at>	2011-08-13 22:24:47 +0200
commit	ca1dfea12771b585846fb86aa08c3d7f066a3cc4 (patch)
tree	6db034b36245b9fe51d9a41d41be1718a3be8b89 /libswscale
parent	75af0e6a1601a4246d6409ca28dc80a3ba0e8d6e (diff)
parent	3304a1e69a8a050eb66d2304acd2d01354fa1aac (diff)