From c8f487deae75d4f25c2ec39ab484c1075f909bbd Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 11 May 2011 19:15:14 -0400
Subject: swscale: fix YUV420P 9/10bit support.

Fix handling of input that is not in native endianness, and add support
for 9/10-bit output. This allows us to force the endianness of YUV420P
9/10-bit in the H264/10-bit fate tests, which should fix them on
big-endian systems.
---
 libswscale/swscale.c | 175 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 142 insertions(+), 33 deletions(-)

(limited to 'libswscale/swscale.c')

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 2830f26ce5..b63a3868c5 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -212,10 +212,11 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                                     const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest,
-                                                    int dstW, int chrDstW, int big_endian)
+                                                    int dstW, int chrDstW, int big_endian, int output_bits)
 {
     //FIXME Optimize (just quickly written not optimized..)
     int i;
+    int shift = 11 + 16 - output_bits;
 
     for (i = 0; i < dstW; i++) {
         int val = 1 << 10;
@@ -225,9 +226,9 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
             val += lumSrc[j][i] * lumFilter[j];
 
         if (big_endian) {
-            AV_WB16(&dest[i], av_clip_uint16(val >> 11));
+            AV_WB16(&dest[i], av_clip_uint16(val >> shift));
         } else {
-            AV_WL16(&dest[i], av_clip_uint16(val >> 11));
+            AV_WL16(&dest[i], av_clip_uint16(val >> shift));
         }
     }
 
@@ -243,11 +244,11 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
             }
 
             if (big_endian) {
-                AV_WB16(&uDest[i], av_clip_uint16(u >> 11));
-                AV_WB16(&vDest[i], av_clip_uint16(v >> 11));
+                AV_WB16(&uDest[i], av_clip_uint16(u >> shift));
+                AV_WB16(&vDest[i], av_clip_uint16(v >> shift));
             } else {
-                AV_WL16(&uDest[i], av_clip_uint16(u >> 11));
-                AV_WL16(&vDest[i], av_clip_uint16(v >> 11));
+                AV_WL16(&uDest[i], av_clip_uint16(u >> shift));
+                AV_WL16(&vDest[i], av_clip_uint16(v >> shift));
             }
         }
     }
@@ -261,9 +262,9 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
                 val += alpSrc[j][i] * lumFilter[j];
 
             if (big_endian) {
-                AV_WB16(&aDest[i], av_clip_uint16(val >> 11));
+                AV_WB16(&aDest[i], av_clip_uint16(val >> shift));
             } else {
-                AV_WL16(&aDest[i], av_clip_uint16(val >> 11));
+                AV_WL16(&aDest[i], av_clip_uint16(val >> shift));
             }
         }
     }
@@ -274,19 +275,28 @@ static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSr
                                  const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
                                  enum PixelFormat dstFormat)
 {
-    if (isBE(dstFormat)) {
-        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
-                               chrFilter, chrSrc, chrFilterSize,
-                               alpSrc,
-                               dest, uDest, vDest, aDest,
-                               dstW, chrDstW, 1);
+#define conv16(bits) \
+    if (isBE(dstFormat)) { \
+        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, \
+                               chrFilter, chrSrc, chrFilterSize, \
+                               alpSrc, \
+                               dest, uDest, vDest, aDest, \
+                               dstW, chrDstW, 1, bits); \
+    } else { \
+        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, \
+                               chrFilter, chrSrc, chrFilterSize, \
+                               alpSrc, \
+                               dest, uDest, vDest, aDest, \
+                               dstW, chrDstW, 0, bits); \
+    }
+    if (is16BPS(dstFormat)) {
+        conv16(16);
+    } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
+        conv16(9);
     } else {
-        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
-                               chrFilter, chrSrc, chrFilterSize,
-                               alpSrc,
-                               dest, uDest, vDest, aDest,
-                               dstW, chrDstW, 0);
+        conv16(10);
     }
+#undef conv16
 }
 
 static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
@@ -1669,25 +1679,124 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
                 length*=2;
             fillPlane(dst[plane], dstStride[plane], length, height, y, (plane==3) ? 255 : 128);
         } else {
-            if(isNBPS(c->srcFormat)) {
-                const int depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
-                uint16_t *srcPtr2 = (uint16_t*)srcPtr;
+            if(is9_OR_10BPS(c->srcFormat)) {
+                const int src_depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
+                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
 
                 if (is16BPS(c->dstFormat)) {
                     uint16_t *dstPtr2 = (uint16_t*)dstPtr;
-                    for (i = 0; i < height; i++) {
-                        for (j = 0; j < length; j++)
-                            dstPtr2[j] = (srcPtr2[j]<<(16-depth)) | (srcPtr2[j]>>(2*depth-16));
-                        dstPtr2 += dstStride[plane]/2;
-                        srcPtr2 += srcStride[plane]/2;
+#define COPY9_OR_10TO16(rfunc, wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx<<(16-src_depth)) | (srcpx>>(2*src_depth-16))); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else if (is9_OR_10BPS(c->dstFormat)) {
+                    uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+#define COPY9_OR_10TO9_OR_10(loop) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            loop; \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) /* copy a plane between 9/10-bit formats; rfunc/wfunc select source/destination byte order */ \
+                    if (dst_depth > src_depth) { /* 9 -> 10 bit: shift up and replicate the top source bit (bit 8) into the new LSB so 511 maps to 1023 */ \
+                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 8))); \
+                    } else if (dst_depth < src_depth) { /* 10 -> 9 bit: drop the least significant bit */ \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]) >> 1)); \
+                    } else { /* equal depth: plain copy, byte-swapping only if rfunc/wfunc differ in endianness */ \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                        }
                     }
                 } else {
                     // FIXME Maybe dither instead.
-                    for (i = 0; i < height; i++) {
-                        for (j = 0; j < length; j++)
-                            dstPtr[j] = srcPtr2[j]>>(depth-8);
-                        dstPtr  += dstStride[plane];
-                        srcPtr2 += srcStride[plane]/2;
+#define COPY9_OR_10TO8(rfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            dstPtr[j] = rfunc(&srcPtr2[j])>>(src_depth-8); \
+                        } \
+                        dstPtr  += dstStride[plane]; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->srcFormat)) {
+                        COPY9_OR_10TO8(AV_RB16);
+                    } else {
+                        COPY9_OR_10TO8(AV_RL16);
+                    }
+                }
+            } else if(is9_OR_10BPS(c->dstFormat)) {
+                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+
+                if (is16BPS(c->srcFormat)) {
+                    const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
+#define COPY16TO9_OR_10(rfunc, wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            wfunc(&dstPtr2[j], rfunc(&srcPtr2[j])>>(16-dst_depth)); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else /* 8bit */ {
+#define COPY8TO9_OR_10(wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            const int srcpx = srcPtr[j]; \
+                            wfunc(&dstPtr2[j], (srcpx<<(dst_depth-8)) | (srcpx >> (16-dst_depth))); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr  += srcStride[plane]; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        COPY8TO9_OR_10(AV_WB16);
+                    } else {
+                        COPY8TO9_OR_10(AV_WL16);
                     }
                 }
             } else if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
-- 
cgit v1.2.3