34 files changed, 2011 insertions, 1245 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 0799b458be..dd00f7d708 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -1,3 +1,5 @@
+include $(SUBDIR)../config.mak
+
 NAME = swscale
 FFLIBS = avutil
 
diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S
index b007f07f53..eab30aa6ce 100644
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -5,20 +5,20 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index 9d0bbe3e2a..2b93858e5c 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -3,20 +3,20 @@
  *
  * Blackfin software video scaler operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
index 5b74c7a516..e1b7afaaa7 100644
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -4,23 +4,24 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/pixdesc.h"
 #include <stdint.h>
 
 #include "config.h"
@@ -195,7 +196,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
     }
 
     av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
-           sws_format_name(c->dstFormat));
+           av_get_pix_fmt_name(c->dstFormat));
 
     return f;
 }
diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c
index fbf595d4c1..42a915bfe6 100644
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define FUNC(s, d, n) { s, d, #n, n }
 
-int main(void)
+int main(int argc, char **argv)
 {
     int i, funcNum;
     uint8_t *srcBuffer = av_malloc(SIZE);
@@ -54,6 +54,7 @@ int main(void)
             const char *name;
             void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
         } func_info[] = {
+            FUNC(2, 2, rgb12to15),
             FUNC(2, 2, rgb15to16),
             FUNC(2, 3, rgb15to24),
             FUNC(2, 4, rgb15to32),
@@ -66,6 +67,7 @@ int main(void)
             FUNC(4, 2, rgb32to16),
             FUNC(4, 3, rgb32to24),
             FUNC(2, 2, rgb16to15),
+            FUNC(2, 2, rgb12tobgr12),
             FUNC(2, 2, rgb15tobgr15),
             FUNC(2, 2, rgb15tobgr16),
             FUNC(2, 3, rgb15tobgr24),
@@ -82,6 +84,12 @@ int main(void)
             FUNC(4, 2, rgb32tobgr16),
             FUNC(4, 3, rgb32tobgr24),
             FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
+            FUNC(6, 6, rgb48tobgr48_nobswap),
+            FUNC(6, 6, rgb48tobgr48_bswap),
+            FUNC(8, 6, rgb64to48_nobswap),
+            FUNC(8, 6, rgb64to48_bswap),
+            FUNC(8, 6, rgb64tobgr48_nobswap),
+            FUNC(8, 6, rgb64tobgr48_bswap),
             FUNC(0, 0, NULL)
         };
         int width;
diff --git a/libswscale/input.c b/libswscale/input.c
index 142cc29a62..43a67aac90 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,6 +30,7 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
 #include "config.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -51,6 +52,86 @@
 #define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? b_r : r_b)
 #define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE) ? r_b : b_r)
 
+static av_always_inline void
+rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
+                    enum AVPixelFormat origin)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i*4+0]);
+        unsigned int   g = input_pixel(&src[i*4+1]);
+        unsigned int b_r = input_pixel(&src[i*4+2]);
+
+        dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
+                    const uint16_t *src1, const uint16_t *src2,
+                    int width, enum AVPixelFormat origin)
+{
+    int i;
+    av_assert1(src1==src2);
+    for (i = 0; i < width; i++) {
+        int r_b = input_pixel(&src1[i*4+0]);
+        int   g = input_pixel(&src1[i*4+1]);
+        int b_r = input_pixel(&src1[i*4+2]);
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+                          const uint16_t *src1, const uint16_t *src2,
+                          int width, enum AVPixelFormat origin)
+{
+    int i;
+    av_assert1(src1==src2);
+    for (i = 0; i < width; i++) {
+        int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+        int   g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+        dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src = (const uint16_t *) _src; \
+    uint16_t *dst = (uint16_t *) _dst; \
+    rgb64ToY_c_template(dst, src, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
+}
+
+rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
+
 static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
                                                  const uint16_t *src, int width,
                                                  enum AVPixelFormat origin)
@@ -73,7 +154,7 @@ static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
                                                   enum AVPixelFormat origin)
 {
     int i;
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
     for (i = 0; i < width; i++) {
         int r_b = input_pixel(&src1[i * 3 + 0]);
         int g   = input_pixel(&src1[i * 3 + 1]);
@@ -92,7 +173,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
                                                        enum AVPixelFormat origin)
 {
     int i;
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
     for (i = 0; i < width; i++) {
         int r_b = (input_pixel(&src1[6 * i + 0]) +
                    input_pixel(&src1[6 * i + 3]) + 1) >> 1;
@@ -113,6 +194,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
 #define rgb48funcs(pattern, BE_LE, origin)                              \
 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                             const uint8_t *_src,        \
+                                            const uint8_t *unused0, const uint8_t *unused1,\
                                             int width,                  \
                                             uint32_t *unused)           \
 {                                                                       \
@@ -123,6 +205,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                                              uint8_t *_dstV,            \
+                                             const uint8_t *unused0,    \
                                              const uint8_t *_src1,      \
                                              const uint8_t *_src2,      \
                                              int width,                 \
@@ -137,6 +220,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
                                                   uint8_t *_dstV,       \
+                                                  const uint8_t *unused0,    \
                                                   const uint8_t *_src1, \
                                                   const uint8_t *_src2, \
                                                   int width,            \
@@ -162,7 +246,7 @@ rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
                         : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
                                         : AV_RL16(&src[(i) * 2])))
 
-static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
+static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
                                                     const uint8_t *src,
                                                     int width,
                                                     enum AVPixelFormat origin,
@@ -173,7 +257,7 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
                                                     int gsh, int bsh, int S)
 {
     const int ry       = RY << rsh, gy = GY << gsh, by = BY << bsh;
-    const unsigned rnd = 33u << (S - 1);
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -182,12 +266,12 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
         int g  = (px & maskg) >> shg;
         int r  = (px & maskr) >> shr;
 
-        dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
     }
 }
 
-static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
-                                                     uint8_t *dstV,
+static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
+                                                     int16_t *dstV,
                                                      const uint8_t *src,
                                                      int width,
                                                      enum AVPixelFormat origin,
@@ -199,7 +283,7 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
 {
     const int ru       = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv       = RV << rsh, gv = GV << gsh, bv = BV << bsh;
-    const unsigned rnd = 257u << (S - 1);
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -208,13 +292,13 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
         int g  = (px & maskg)   >> shg;
         int r  = (px & maskr)   >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
     }
 }
 
-static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
-                                                          uint8_t *dstV,
+static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
+                                                          int16_t *dstV,
                                                           const uint8_t *src,
                                                           int width,
                                                           enum AVPixelFormat origin,
@@ -227,7 +311,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
     const int ru       = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv       = RV << rsh, gv = GV << gsh, bv = BV << bsh,
               maskgx   = ~(maskr | maskb);
-    const unsigned rnd = 257u << S;
+    const unsigned rnd = (256U<<(S)) + (1<<(S-6));
     int i;
 
     maskr |= maskr << 1;
@@ -249,8 +333,8 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
         }
         r = (rb & maskr) >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
+        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
     }
 }
 
@@ -258,28 +342,28 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
 
 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
                          maskg, maskb, rsh, gsh, bsh, S)                \
-static void name ## ToY_c(uint8_t *dst, const uint8_t *src,             \
+static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
                           int width, uint32_t *unused)                  \
 {                                                                       \
-    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp,    \
+    rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
                            maskr, maskg, maskb, rsh, gsh, bsh, S);      \
 }                                                                       \
                                                                         \
 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
-                           const uint8_t *src, const uint8_t *dummy,    \
+                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
                            int width, uint32_t *unused)                 \
 {                                                                       \
-    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt,                \
+    rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
                             shr, shg, shb, shp,                         \
                             maskr, maskg, maskb, rsh, gsh, bsh, S);     \
 }                                                                       \
                                                                         \
 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
-                                const uint8_t *src,                     \
+                                const uint8_t *unused0, const uint8_t *src,                     \
                                 const uint8_t *dummy,                   \
                                 int width, uint32_t *unused)            \
 {                                                                       \
-    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt,           \
+    rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
                                  shr, shg, shb, shp,                    \
                                  maskr, maskg, maskb,                   \
                                  rsh, gsh, bsh, S);                     \
@@ -302,71 +386,124 @@ rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,
 rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
 rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
 
-static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
+static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
+                         const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+                         int width, uint32_t *unused)
 {
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
     int i;
-    for (i = 0; i < width; i++)
-        dst[i] = src[4 * i];
+    for (i = 0; i < width; i++) {
+        unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
+        unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
+        unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+    }
 }
 
-static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
+static void rgba64ToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+                        const uint8_t *unused2, int width, uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
+    const uint16_t *src = (const uint16_t *)_src;
     int i;
     for (i = 0; i < width; i++)
         dst[i] = src[4 * i + 3];
 }
 
-static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
+static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i]<<6;
+    }
+}
+
+static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int16_t *dst = (int16_t *)_dst;
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i+3]<<6;
+    }
+}
+
+static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+{
+    int16_t *dst = (int16_t *)_dst;
+    int i;
+    for (i=0; i<width; i++) {
+        int d= src[i];
+
+        dst[i]= (pal[d] >> 24)<<6;
+    }
+}
+
+static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+{
+    int16_t *dst = (int16_t *)_dst;
     int i;
     for (i = 0; i < width; i++) {
         int d = src[i];
 
-        dst[i] = pal[d] & 0xFF;
+        dst[i] = (pal[d] & 0xFF)<<6;
     }
 }
 
-static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
+                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *pal)
 {
+    uint16_t *dstU = (uint16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
     int i;
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
     for (i = 0; i < width; i++) {
         int p = pal[src1[i]];
 
-        dstU[i] = p >> 8;
-        dstV[i] = p >> 16;
+        dstU[i] = (uint8_t)(p>> 8)<<6;
+        dstV[i] = (uint8_t)(p>>16)<<6;
     }
 }
 
-static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
     int i, j;
     width = (width + 7) >> 3;
     for (i = 0; i < width; i++) {
         int d = ~src[i];
         for (j = 0; j < 8; j++)
-            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    if(width&7){
+        int d= ~src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
     }
 }
 
-static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
     int i, j;
     width = (width + 7) >> 3;
     for (i = 0; i < width; i++) {
         int d = src[i];
         for (j = 0; j < 8; j++)
-            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    if(width&7){
+        int d = src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
     }
 }
 
-static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -374,7 +511,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i] = src[2 * i];
 }
 
-static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -382,10 +519,10 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         dstU[i] = src1[4 * i + 1];
         dstV[i] = src1[4 * i + 3];
     }
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
 }
 
-static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width,
+static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                        uint32_t *unused)
 {
     int i;
@@ -395,7 +532,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width,
         dst[i] = av_bswap16(src[i]);
 }
 
-static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
+static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                         const uint8_t *_src2, int width, uint32_t *unused)
 {
     int i;
@@ -410,7 +547,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
 
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
-static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -418,7 +555,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i] = src[2 * i + 1];
 }
 
-static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -426,7 +563,7 @@ static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         dstU[i] = src1[4 * i + 0];
         dstV[i] = src1[4 * i + 2];
     }
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
 }
 
 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
@@ -440,14 +577,14 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
 }
 
 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstU, dstV, src1, width);
 }
 
 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstV, dstU, src1, width);
@@ -455,114 +592,127 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
 
 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
-static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
+static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                        int width, uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
     int i;
     for (i = 0; i < width; i++) {
         int b = src[i * 3 + 0];
         int g = src[i * 3 + 1];
         int r = src[i * 3 + 2];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
     int i;
     for (i = 0; i < width; i++) {
         int b = src1[3 * i + 0];
         int g = src1[3 * i + 1];
         int r = src1[3 * i + 2];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
 }
 
-static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
     int i;
     for (i = 0; i < width; i++) {
         int b = src1[6 * i + 0] + src1[6 * i + 3];
         int g = src1[6 * i + 1] + src1[6 * i + 4];
         int r = src1[6 * i + 2] + src1[6 * i + 5];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
 }
 
-static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
                        uint32_t *unused)
 {
+    int16_t *dst = (int16_t *)_dst;
     int i;
     for (i = 0; i < width; i++) {
         int r = src[i * 3 + 0];
         int g = src[i * 3 + 1];
         int b = src[i * 3 + 2];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
     int i;
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
     for (i = 0; i < width; i++) {
         int r = src1[3 * i + 0];
         int g = src1[3 * i + 1];
         int b = src1[3 * i + 2];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
 }
 
-static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
+    int16_t *dstU = (int16_t *)_dstU;
+    int16_t *dstV = (int16_t *)_dstV;
     int i;
-    assert(src1 == src2);
+    av_assert1(src1 == src2);
     for (i = 0; i < width; i++) {
         int r = src1[6 * i + 0] + src1[6 * i + 3];
         int g = src1[6 * i + 1] + src1[6 * i + 4];
         int b = src1[6 * i + 2] + src1[6 * i + 5];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
 }
 
-static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
+static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width)
 {
+    uint16_t *dst = (uint16_t *)_dst;
     int i;
     for (i = 0; i < width; i++) {
         int g = src[0][i];
         int b = src[1][i];
         int r = src[2][i];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
-static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
+static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width)
 {
+    uint16_t *dstU = (uint16_t *)_dstU;
+    uint16_t *dstV = (uint16_t *)_dstV;
     int i;
     for (i = 0; i < width; i++) {
         int g = src[0][i];
         int b = src[1][i];
         int r = src[2][i];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
@@ -603,6 +753,26 @@ static void planar_rgb10be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
     planar_rgb16_to_y(dst, src, w, 10, 1);
 }
 
+static void planar_rgb12le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_y(dst, src, w, 12, 0);
+}
+
+static void planar_rgb12be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_y(dst, src, w, 12, 1);
+}
+
+static void planar_rgb14le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_y(dst, src, w, 14, 0);
+}
+
+static void planar_rgb14be_to_y(uint8_t *dst, const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_y(dst, src, w, 14, 1);
+}
+
 static void planar_rgb16le_to_y(uint8_t *dst, const uint8_t *src[4], int w)
 {
     planar_rgb16_to_y(dst, src, w, 16, 0);
@@ -656,6 +826,30 @@ static void planar_rgb10be_to_uv(uint8_t *dstU, uint8_t *dstV,
     planar_rgb16_to_uv(dstU, dstV, src, w, 10, 1);
 }
 
+static void planar_rgb12le_to_uv(uint8_t *dstU, uint8_t *dstV,
+                                 const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_uv(dstU, dstV, src, w, 12, 0);
+}
+
+static void planar_rgb12be_to_uv(uint8_t *dstU, uint8_t *dstV,
+                                 const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_uv(dstU, dstV, src, w, 12, 1);
+}
+
+static void planar_rgb14le_to_uv(uint8_t *dstU, uint8_t *dstV,
+                                 const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_uv(dstU, dstV, src, w, 14, 0);
+}
+
+static void planar_rgb14be_to_uv(uint8_t *dstU, uint8_t *dstV,
+                                 const uint8_t *src[4], int w)
+{
+    planar_rgb16_to_uv(dstU, dstV, src, w, 14, 1);
+}
+
 static void planar_rgb16le_to_uv(uint8_t *dstU, uint8_t *dstV,
                                  const uint8_t *src[4], int w)
 {
@@ -699,6 +893,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_GBRP10LE:
         c->readChrPlanar = planar_rgb10le_to_uv;
         break;
+    case AV_PIX_FMT_GBRP12LE:
+        c->readChrPlanar = planar_rgb12le_to_uv;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readChrPlanar = planar_rgb14le_to_uv;
+        break;
     case AV_PIX_FMT_GBRP16LE:
         c->readChrPlanar = planar_rgb16le_to_uv;
         break;
@@ -708,6 +908,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_GBRP10BE:
         c->readChrPlanar = planar_rgb10be_to_uv;
         break;
+    case AV_PIX_FMT_GBRP12BE:
+        c->readChrPlanar = planar_rgb12be_to_uv;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readChrPlanar = planar_rgb14be_to_uv;
+        break;
     case AV_PIX_FMT_GBRP16BE:
         c->readChrPlanar = planar_rgb16be_to_uv;
         break;
@@ -721,8 +927,24 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_YUV422P10LE:
     case AV_PIX_FMT_YUV444P10LE:
     case AV_PIX_FMT_YUV420P10LE:
+    case AV_PIX_FMT_YUV422P12LE:
+    case AV_PIX_FMT_YUV444P12LE:
+    case AV_PIX_FMT_YUV420P12LE:
+    case AV_PIX_FMT_YUV422P14LE:
+    case AV_PIX_FMT_YUV444P14LE:
+    case AV_PIX_FMT_YUV420P14LE:
     case AV_PIX_FMT_YUV420P16LE:
     case AV_PIX_FMT_YUV422P16LE:
+
+    case AV_PIX_FMT_YUVA444P9LE:
+    case AV_PIX_FMT_YUVA422P9LE:
+    case AV_PIX_FMT_YUVA420P9LE:
+    case AV_PIX_FMT_YUVA444P10LE:
+    case AV_PIX_FMT_YUVA422P10LE:
+    case AV_PIX_FMT_YUVA420P10LE:
+    case AV_PIX_FMT_YUVA420P16LE:
+    case AV_PIX_FMT_YUVA422P16LE:
+    case AV_PIX_FMT_YUVA444P16LE:
     case AV_PIX_FMT_YUV444P16LE:
         c->chrToYV12 = bswap16UV_c;
         break;
@@ -733,15 +955,37 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_YUV444P10BE:
     case AV_PIX_FMT_YUV422P10BE:
     case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV420P14BE:
     case AV_PIX_FMT_YUV420P16BE:
     case AV_PIX_FMT_YUV422P16BE:
     case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
         c->chrToYV12 = bswap16UV_c;
         break;
 #endif
     }
     if (c->chrSrcHSubSample) {
         switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_half_c;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_half_c;
+            break;
         case AV_PIX_FMT_RGB48BE:
             c->chrToYV12 = rgb48BEToUV_half_c;
             break;
@@ -775,6 +1019,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_BGR555BE:
             c->chrToYV12 = bgr15beToUV_half_c;
             break;
+        case AV_PIX_FMT_GBR24P  :
+            c->chrToYV12 = gbr24pToUV_half_c;
+            break;
         case AV_PIX_FMT_BGR444LE:
             c->chrToYV12 = bgr12leToUV_half_c;
             break;
@@ -811,6 +1058,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         }
     } else {
         switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_c;
+            break;
+        case AV_PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_c;
+            break;
         case AV_PIX_FMT_RGB48BE:
             c->chrToYV12 = rgb48BEToUV_c;
             break;
@@ -889,6 +1142,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_GBRP10LE:
         c->readLumPlanar = planar_rgb10le_to_y;
         break;
+    case AV_PIX_FMT_GBRP12LE:
+        c->readLumPlanar = planar_rgb12le_to_y;
+        break;
+    case AV_PIX_FMT_GBRP14LE:
+        c->readLumPlanar = planar_rgb14le_to_y;
+        break;
     case AV_PIX_FMT_GBRP16LE:
         c->readLumPlanar = planar_rgb16le_to_y;
         break;
@@ -898,6 +1157,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_GBRP10BE:
         c->readLumPlanar = planar_rgb10be_to_y;
         break;
+    case AV_PIX_FMT_GBRP12BE:
+        c->readLumPlanar = planar_rgb12be_to_y;
+        break;
+    case AV_PIX_FMT_GBRP14BE:
+        c->readLumPlanar = planar_rgb14be_to_y;
+        break;
     case AV_PIX_FMT_GBRP16BE:
         c->readLumPlanar = planar_rgb16be_to_y;
         break;
@@ -911,9 +1176,25 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_YUV444P10LE:
     case AV_PIX_FMT_YUV422P10LE:
     case AV_PIX_FMT_YUV420P10LE:
+    case AV_PIX_FMT_YUV444P12LE:
+    case AV_PIX_FMT_YUV422P12LE:
+    case AV_PIX_FMT_YUV420P12LE:
+    case AV_PIX_FMT_YUV444P14LE:
+    case AV_PIX_FMT_YUV422P14LE:
+    case AV_PIX_FMT_YUV420P14LE:
     case AV_PIX_FMT_YUV420P16LE:
     case AV_PIX_FMT_YUV422P16LE:
     case AV_PIX_FMT_YUV444P16LE:
+
+    case AV_PIX_FMT_YUVA444P9LE:
+    case AV_PIX_FMT_YUVA422P9LE:
+    case AV_PIX_FMT_YUVA420P9LE:
+    case AV_PIX_FMT_YUVA444P10LE:
+    case AV_PIX_FMT_YUVA422P10LE:
+    case AV_PIX_FMT_YUVA420P10LE:
+    case AV_PIX_FMT_YUVA420P16LE:
+    case AV_PIX_FMT_YUVA422P16LE:
+    case AV_PIX_FMT_YUVA444P16LE:
     case AV_PIX_FMT_GRAY16LE:
         c->lumToYV12 = bswap16Y_c;
         break;
@@ -924,9 +1205,25 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_YUV444P10BE:
     case AV_PIX_FMT_YUV422P10BE:
     case AV_PIX_FMT_YUV420P10BE:
+    case AV_PIX_FMT_YUV444P12BE:
+    case AV_PIX_FMT_YUV422P12BE:
+    case AV_PIX_FMT_YUV420P12BE:
+    case AV_PIX_FMT_YUV444P14BE:
+    case AV_PIX_FMT_YUV422P14BE:
+    case AV_PIX_FMT_YUV420P14BE:
     case AV_PIX_FMT_YUV420P16BE:
     case AV_PIX_FMT_YUV422P16BE:
     case AV_PIX_FMT_YUV444P16BE:
+
+    case AV_PIX_FMT_YUVA444P9BE:
+    case AV_PIX_FMT_YUVA422P9BE:
+    case AV_PIX_FMT_YUVA420P9BE:
+    case AV_PIX_FMT_YUVA444P10BE:
+    case AV_PIX_FMT_YUVA422P10BE:
+    case AV_PIX_FMT_YUVA420P10BE:
+    case AV_PIX_FMT_YUVA420P16BE:
+    case AV_PIX_FMT_YUVA422P16BE:
+    case AV_PIX_FMT_YUVA444P16BE:
     case AV_PIX_FMT_GRAY16BE:
         c->lumToYV12 = bswap16Y_c;
         break;
@@ -1017,9 +1314,21 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case AV_PIX_FMT_BGR48LE:
         c->lumToYV12 = bgr48LEToY_c;
         break;
+    case AV_PIX_FMT_RGBA64BE:
+        c->lumToYV12 = rgb64BEToY_c;
+        break;
+    case AV_PIX_FMT_RGBA64LE:
+        c->lumToYV12 = rgb64LEToY_c;
+        break;
     }
     if (c->alpPixBuf) {
+        if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
+            if (HAVE_BIGENDIAN == !isBE(srcFormat))
+                c->alpToYV12 = bswap16Y_c;
+        }
         switch (srcFormat) {
+        case AV_PIX_FMT_RGBA64LE:
+        case AV_PIX_FMT_RGBA64BE:  c->alpToYV12 = rgba64ToA_c; break;
         case AV_PIX_FMT_BGRA:
         case AV_PIX_FMT_RGBA:
             c->alpToYV12 = rgbaToA_c;
@@ -1031,6 +1340,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case AV_PIX_FMT_Y400A:
             c->alpToYV12 = uyvyToY_c;
             break;
+        case AV_PIX_FMT_PAL8 :
+            c->alpToYV12 = palToA_c;
+            break;
         }
     }
 }
diff --git a/libswscale/options.c b/libswscale/options.c
index daf013c741..9cd4fc0607 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,7 +66,12 @@ static const AVOption options[] = {
     { NULL }
 };
 
-const AVClass sws_context_class = { "SWScaler", sws_context_to_name, options };
+const AVClass sws_context_class = {
+    .class_name = "SWScaler",
+    .item_name  = sws_context_to_name,
+    .option     = options,
+    .category   = AV_CLASS_CATEGORY_SWSCALER,
+};
 
 const AVClass *sws_get_class(void)
 {
diff --git a/libswscale/output.c b/libswscale/output.c
index 4953290018..8306298d64 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -26,6 +26,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
 #include "libavutil/intreadwrite.h"
@@ -36,24 +37,27 @@
 #include "swscale.h"
 #include "swscale_internal.h"
 
-DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_2x2_4)[][8]={
 {  1,   3,   1,   3,   1,   3,   1,   3, },
 {  2,   0,   2,   0,   2,   0,   2,   0, },
+{  1,   3,   1,   3,   1,   3,   1,   3, },
 };
 
-DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_2x2_8)[][8]={
 {  6,   2,   6,   2,   6,   2,   6,   2, },
 {  0,   4,   0,   4,   0,   4,   0,   4, },
+{  6,   2,   6,   2,   6,   2,   6,   2, },
 };
 
-DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[][8]={
 {  8,   4,  11,   7,   8,   4,  11,   7, },
 {  2,  14,   1,  13,   2,  14,   1,  13, },
 { 10,   6,   9,   5,  10,   6,   9,   5, },
 {  0,  12,   3,  15,   0,  12,   3,  15, },
+{  8,   4,  11,   7,   8,   4,  11,   7, },
 };
 
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[][8]={
 { 17,   9,  23,  15,  16,   8,  22,  14, },
 {  5,  29,   3,  27,   4,  28,   2,  26, },
 { 21,  13,  19,  11,  20,  12,  18,  10, },
@@ -62,9 +66,10 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 {  4,  28,   2,  26,   5,  29,   3,  27, },
 { 20,  12,  18,  10,  21,  13,  19,  11, },
 {  1,  25,   7,  31,   0,  24,   6,  30, },
+{ 17,   9,  23,  15,  16,   8,  22,  14, },
 };
 
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[][8]={
 {  0,  55,  14,  68,   3,  58,  17,  72, },
 { 37,  18,  50,  32,  40,  22,  54,  35, },
 {  9,  64,   5,  59,  13,  67,   8,  63, },
@@ -73,10 +78,11 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 { 39,  21,  52,  34,  38,  19,  51,  33, },
 { 11,  66,   7,  62,  10,  65,   6,  60, },
 { 48,  30,  43,  25,  47,  29,  42,  24, },
+{  0,  55,  14,  68,   3,  58,  17,  72, },
 };
 
 #if 1
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
 {117,  62, 158, 103, 113,  58, 155, 100, },
 { 34, 199,  21, 186,  31, 196,  17, 182, },
 {144,  89, 131,  76, 141,  86, 127,  72, },
@@ -85,10 +91,11 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 28, 193,  14, 179,  38, 203,  24, 189, },
 {138,  83, 124,  69, 148,  93, 134,  79, },
 {  7, 172,  48, 213,   3, 168,  45, 210, },
+{117,  62, 158, 103, 113,  58, 155, 100, },
 };
 #elif 1
 // tries to correct a gamma of 1.5
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
 {  0, 143,  18, 200,   2, 156,  25, 215, },
 { 78,  28, 125,  64,  89,  36, 138,  74, },
 { 10, 180,   3, 161,  16, 195,   8, 175, },
@@ -97,10 +104,11 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 85,  33, 134,  71,  81,  30, 130,  67, },
 { 14, 190,   6, 171,  12, 185,   5, 166, },
 {117,  57, 101,  44, 113,  54,  97,  41, },
+{  0, 143,  18, 200,   2, 156,  25, 215, },
 };
 #elif 1
 // tries to correct a gamma of 2.0
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
 {  0, 124,   8, 193,   0, 140,  12, 213, },
 { 55,  14, 104,  42,  66,  19, 119,  52, },
 {  3, 168,   1, 145,   6, 187,   3, 162, },
@@ -109,10 +117,11 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 62,  17, 114,  48,  58,  16, 109,  45, },
 {  5, 181,   2, 157,   4, 175,   1, 151, },
 { 95,  36,  78,  26,  90,  34,  74,  24, },
+{  0, 124,   8, 193,   0, 140,  12, 213, },
 };
 #else
 // tries to correct a gamma of 2.5
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[][8]={
 {  0, 107,   3, 187,   0, 125,   6, 212, },
 { 39,   7,  86,  28,  49,  11, 102,  36, },
 {  1, 158,   0, 131,   3, 180,   1, 151, },
@@ -121,6 +130,7 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 { 45,   9,  96,  33,  42,   8,  91,  30, },
 {  2, 172,   1, 144,   2, 165,   0, 137, },
 { 77,  23,  60,  15,  72,  21,  56,  14, },
+{  0, 107,   3, 187,   0, 125,   6, 212, },
 };
 #endif
 
@@ -136,7 +146,8 @@ yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 19 - output_bits;
+    int shift = 3;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
         int val = src[i] + (1 << (shift - 1));
@@ -150,10 +161,11 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 15 + 16 - output_bits;
+    int shift = 15;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (30-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
@@ -163,7 +175,7 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
          * reasonable filterSize), and re-add that at the end. */
         val -= 0x40000000;
         for (j = 0; j < filterSize; j++)
-            val += src[j][i] * filter[j];
+            val += src[j][i] * (unsigned)filter[j];
 
         output_pixel(&dest[i], val, 0x8000, int);
     }
@@ -200,7 +212,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
     int shift = 11 + 16 - output_bits;
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (26-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         for (j = 0; j < filterSize; j++)
@@ -393,14 +405,14 @@ yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
     for (i = 0; i < dstW; i += 8) {
         int acc = 0;
 
-        accumulate_bit(acc, (buf0[i + 0] >> 7) + d128[0]);
-        accumulate_bit(acc, (buf0[i + 1] >> 7) + d128[1]);
-        accumulate_bit(acc, (buf0[i + 2] >> 7) + d128[2]);
-        accumulate_bit(acc, (buf0[i + 3] >> 7) + d128[3]);
-        accumulate_bit(acc, (buf0[i + 4] >> 7) + d128[4]);
-        accumulate_bit(acc, (buf0[i + 5] >> 7) + d128[5]);
-        accumulate_bit(acc, (buf0[i + 6] >> 7) + d128[6]);
-        accumulate_bit(acc, (buf0[i + 7] >> 7) + d128[7]);
+        accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]);
+        accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]);
+        accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]);
+        accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]);
+        accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]);
+        accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]);
+        accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]);
+        accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]);
 
         output_pixel(*dest++, acc);
     }
@@ -516,10 +528,12 @@ yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 
-        Y1 = av_clip_uint8(Y1);
-        Y2 = av_clip_uint8(Y2);
-        U  = av_clip_uint8(U);
-        V  = av_clip_uint8(V);
+        if ((Y1 | Y2 | U | V) & 0x100) {
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+            U  = av_clip_uint8(U);
+            V  = av_clip_uint8(V);
+        }
 
         output_pixels(i * 4, Y1, U, Y2, V);
     }
@@ -536,10 +550,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ]+64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]+64) >> 7;
+            int U  = (ubuf0[i]       +64) >> 7;
+            int V  = (vbuf0[i]       +64) >> 7;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             Y1 = av_clip_uint8(Y1);
             Y2 = av_clip_uint8(Y2);
@@ -551,10 +572,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]    + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]    + 64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i]+128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i]+128) >> 8;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             Y1 = av_clip_uint8(Y1);
             Y2 = av_clip_uint8(Y2);
@@ -599,12 +627,12 @@ yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
         int R, G, B;
 
         for (j = 0; j < lumFilterSize; j++) {
-            Y1 += lumSrc[j][i * 2]     * lumFilter[j];
-            Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
+            Y1 += lumSrc[j][i * 2]     * (unsigned)lumFilter[j];
+            Y2 += lumSrc[j][i * 2 + 1] * (unsigned)lumFilter[j];
         }
-        for (j = 0; j < chrFilterSize; j++) {
-            U += chrUSrc[j][i] * chrFilter[j];
-            V += chrVSrc[j][i] * chrFilter[j];
+        for (j = 0; j < chrFilterSize; j++) {;
+            U += chrUSrc[j][i] * (unsigned)chrFilter[j];
+            V += chrVSrc[j][i] * (unsigned)chrFilter[j];
         }
 
         // 8bit: 12+15=27; 16-bit: 12+19=31
@@ -812,7 +840,7 @@ YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, AV_PIX_FMT_BGR48LE)
  * correct RGB values into the destination buffer.
  */
 static av_always_inline void
-yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
               unsigned A1, unsigned A2,
               const void *_r, const void *_g, const void *_b, int y,
               enum AVPixelFormat target, int hasAlpha)
@@ -848,6 +876,7 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
 
 #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
 #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
+
         dest[i * 6 + 0] = r_b[Y1];
         dest[i * 6 + 1] =   g[Y1];
         dest[i * 6 + 2] = b_r[Y1];
@@ -953,12 +982,6 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
         Y2 >>= 19;
         U  >>= 19;
         V  >>= 19;
-        if ((Y1 | Y2 | U | V) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-        }
         if (hasAlpha) {
             A1 = 1 << 18;
             A2 = 1 << 18;
@@ -974,10 +997,9 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
             }
         }
 
-        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
                       r, g, b, y, target, hasAlpha);
@@ -1006,16 +1028,9 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
         int A1, A2;
-        const void *r, *g, *b;
-
-        Y1 = av_clip_uint8(Y1);
-        Y2 = av_clip_uint8(Y2);
-        U  = av_clip_uint8(U);
-        V  = av_clip_uint8(V);
-
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
+        const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                   *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                   *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         if (hasAlpha) {
             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
@@ -1041,25 +1056,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ] + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+            int U  = (ubuf0[i]        + 64) >> 7;
+            int V  = (vbuf0[i]        + 64) >> 7;
             int A1, A2;
-            const void *r, *g, *b;
-
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-
-            r =  c->table_rV[V];
-            g = (c->table_gU[U] + c->table_gV[V]);
-            b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = abuf0[i * 2    ] * 255 + 16384 >> 15;
+                A2 = abuf0[i * 2 + 1] * 255 + 16384 >> 15;
                 A1 = av_clip_uint8(A1);
                 A2 = av_clip_uint8(A2);
             }
@@ -1070,25 +1078,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]     +  64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]     +  64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i] + 128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i] + 128) >> 8;
             int A1, A2;
-            const void *r, *g, *b;
-
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-
-            r =  c->table_rV[V];
-            g = (c->table_gU[U] + c->table_gV[V]);
-            b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = (abuf0[i * 2    ] + 64) >> 7;
+                A2 = (abuf0[i * 2 + 1] + 64) >> 7;
                 A1 = av_clip_uint8(A1);
                 A2 = av_clip_uint8(A2);
             }
@@ -1164,9 +1165,9 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
 
     for (i = 0; i < dstW; i++) {
         int j;
-        int Y = 0;
-        int U = -128 << 19;
-        int V = -128 << 19;
+        int Y = 1<<9;
+        int U = (1<<9)-(128 << 19);
+        int V = (1<<9)-(128 << 19);
         int R, G, B, A;
 
         for (j = 0; j < lumFilterSize; j++) {
@@ -1180,7 +1181,7 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
         U >>= 10;
         V >>= 10;
         if (hasAlpha) {
-            A = 1 << 21;
+            A = 1 << 18;
             for (j = 0; j < lumFilterSize; j++) {
                 A += alpSrc[j][i] * lumFilter[j];
             }
@@ -1223,7 +1224,6 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
             dest[1] = B >> 22;
             dest[2] = G >> 22;
             dest[3] = R >> 22;
-            dest += 4;
             break;
         case AV_PIX_FMT_BGR24:
             dest[0] = B >> 22;
@@ -1355,7 +1355,10 @@ av_cold void ff_sws_init_output_funcs(SwsContext *c,
             *yuv2packedX = yuv2bgr24_full_X_c;
             break;
         }
+        if(!*yuv2packedX)
+            goto YUV_PACKED;
     } else {
+        YUV_PACKED:
         switch (dstFormat) {
         case AV_PIX_FMT_RGB48LE:
             *yuv2packed1 = yuv2rgb48le_1_c;
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index b0e25d041a..9ca2868cc8 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -292,7 +292,7 @@ av_cold void ff_sws_init_swScale_altivec(SwsContext *c)
     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
         return;
 
-    if (c->srcBpc == 8 && c->dstBpc <= 10) {
+    if (c->srcBpc == 8 && c->dstBpc <= 14) {
         c->hyScale = c->hcScale = hScale_altivec_real;
     }
     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
index 94e87fe62f..a8501d9a10 100644
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -3,20 +3,20 @@
  *
  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -97,6 +97,7 @@
 #include "libswscale/swscale_internal.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
 #include "yuv2rgb_altivec.h"
 
 #undef PROFILE_THE_BEAST
@@ -317,12 +318,7 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
     const ubyte *ui  = in[1];                                                 \
     const ubyte *vi  = in[2];                                                 \
                                                                               \
-    vector unsigned char *oute =                                              \
-        (vector unsigned char *)                                              \
-            (oplanes[0] + srcSliceY * outstrides[0]);                         \
-    vector unsigned char *outo =                                              \
-        (vector unsigned char *)                                              \
-            (oplanes[0] + srcSliceY * outstrides[0] + outstrides[0]);         \
+    vector unsigned char *oute, *outo;                                        \
                                                                               \
     /* loop moves y{1, 2}i by w */                                            \
     instrides_scl[0] = instrides[0] * 2 - w;                                  \
@@ -332,6 +328,9 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
     instrides_scl[2] = instrides[2] - w / 2;                                  \
                                                                               \
     for (i = 0; i < h / 2; i++) {                                             \
+        oute = (vector unsigned char *)(oplanes[0] + outstrides[0] *          \
+                                        (srcSliceY + i * 2));                 \
+        outo = oute + (outstrides[0] >> 4);                                   \
         vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0);       \
         vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1);       \
                                                                               \
@@ -429,9 +428,6 @@ static int altivec_ ## name(SwsContext *c, const unsigned char **in,          \
             vi  += 8;                                                         \
         }                                                                     \
                                                                               \
-        outo += (outstrides[0]) >> 4;                                         \
-        oute += (outstrides[0]) >> 4;                                         \
-                                                                              \
         ui  += instrides_scl[1];                                              \
         vi  += instrides_scl[2];                                              \
         y1i += instrides_scl[0];                                              \
@@ -736,7 +732,7 @@ static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c,
             if (!printed_error_message) {
                 av_log(c, AV_LOG_ERROR,
                        "altivec_yuv2packedX doesn't support %s output\n",
-                       sws_format_name(c->dstFormat));
+                       av_get_pix_fmt_name(c->dstFormat));
                 printed_error_message = 1;
             }
             return;
@@ -824,7 +820,7 @@ static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c,
             /* Unreachable, I think. */
             av_log(c, AV_LOG_ERROR,
                    "altivec_yuv2packedX doesn't support %s output\n",
-                   sws_format_name(c->dstFormat));
+                   av_get_pix_fmt_name(c->dstFormat));
             return;
         }
 
diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h
index 2c5e7ed876..aa52a4743e 100644
--- a/libswscale/ppc/yuv2rgb_altivec.h
+++ b/libswscale/ppc/yuv2rgb_altivec.h
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/ppc/yuv2yuv_altivec.c b/libswscale/ppc/yuv2yuv_altivec.c
index e68cccf6a7..60d50a7baa 100644
--- a/libswscale/ppc/yuv2yuv_altivec.c
+++ b/libswscale/ppc/yuv2yuv_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index c261fe6ec1..1233a1d3f4 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -6,20 +6,20 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -181,13 +181,13 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 #else
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
         *d++ = 255;
 #endif
     }
@@ -220,9 +220,9 @@ void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
     }
 }
 
@@ -256,13 +256,13 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 #else
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
         *d++ = 255;
 #endif
     }
@@ -276,9 +276,9 @@ void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
     }
 }
 
@@ -315,18 +315,6 @@ void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size)
-{
-    int i, num_pixels = src_size;
-
-    for (i = 0; i < num_pixels; i++) {
-        register uint8_t rgb = src[i];
-        unsigned r           = (rgb & 0x07);
-        unsigned g           = (rgb & 0x38) >> 3;
-        unsigned b           = (rgb & 0xC0) >> 6;
-        dst[i]               = ((b << 1) & 0x07) | ((g & 0x07) << 3) | ((r & 0x03) << 6);
-    }
-}
 
 #define DEFINE_SHUFFLE_BYTES(a, b, c, d)                                \
 void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src,             \
@@ -346,3 +334,57 @@ DEFINE_SHUFFLE_BYTES(0, 3, 2, 1)
 DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
 DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
 DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
+
+#define DEFINE_RGB48TOBGR48(need_bswap, swap)                           \
+void rgb48tobgr48_ ## need_bswap(const uint8_t *src,                    \
+                                 uint8_t *dst, int src_size)            \
+{                                                                       \
+    uint16_t *d = (uint16_t *)dst;                                      \
+    uint16_t *s = (uint16_t *)src;                                      \
+    int i, num_pixels = src_size >> 1;                                  \
+                                                                        \
+    for (i = 0; i < num_pixels; i += 3) {                               \
+        d[i    ] = swap ? av_bswap16(s[i + 2]) : s[i + 2];              \
+        d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1];              \
+        d[i + 2] = swap ? av_bswap16(s[i    ]) : s[i    ];              \
+    }                                                                   \
+}
+
+DEFINE_RGB48TOBGR48(nobswap, 0)
+DEFINE_RGB48TOBGR48(bswap, 1)
+
+#define DEFINE_RGB64TOBGR48(need_bswap, swap)                           \
+void rgb64tobgr48_ ## need_bswap(const uint8_t *src,                    \
+                                 uint8_t *dst, int src_size)            \
+{                                                                       \
+    uint16_t *d = (uint16_t *)dst;                                      \
+    uint16_t *s = (uint16_t *)src;                                      \
+    int i, num_pixels = src_size >> 3;                                  \
+                                                                        \
+    for (i = 0; i < num_pixels; i++) {                                  \
+        d[3 * i    ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2];  \
+        d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1];  \
+        d[3 * i + 2] = swap ? av_bswap16(s[4 * i    ]) : s[4 * i    ];  \
+    }                                                                   \
+}
+
+DEFINE_RGB64TOBGR48(nobswap, 0)
+DEFINE_RGB64TOBGR48(bswap, 1)
+
+#define DEFINE_RGB64TO48(need_bswap, swap)                              \
+void rgb64to48_ ## need_bswap(const uint8_t *src,                       \
+                              uint8_t *dst, int src_size)               \
+{                                                                       \
+    uint16_t *d = (uint16_t *)dst;                                      \
+    uint16_t *s = (uint16_t *)src;                                      \
+    int i, num_pixels = src_size >> 3;                                  \
+                                                                        \
+    for (i = 0; i < num_pixels; i++) {                                  \
+        d[3 * i    ] = swap ? av_bswap16(s[4 * i    ]) : s[4 * i    ];  \
+        d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1];  \
+        d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2];  \
+    }                                                                   \
+}
+
+DEFINE_RGB64TO48(nobswap, 0)
+DEFINE_RGB64TO48(bswap, 1)
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 42f468fe21..e37f0fbbbe 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -6,20 +6,20 @@
  *  Written by Nick Kurshev.
  *  YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,12 @@ extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
 
 extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
 
+void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void   rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void   rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
+void    rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
+void      rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
@@ -64,7 +70,6 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
-void   bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size);
 
 void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
 void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 3785ef9f07..8753594aaf 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -238,27 +238,6 @@ static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-/*
- * I use less accurate approximation here by simply left-shifting the input
- * value and filling the low order bits with zeroes. This method improves PNG
- * compression but this scheme cannot reproduce white exactly, since it does
- * not generate an all-ones maximum value; the net effect is to darken the
- * image slightly.
- *
- * The better method should be "left bit replication":
- *
- *  4 3 2 1 0
- *  ---------
- *  1 1 0 1 1
- *
- *  7 6 5 4 3  2 1 0
- *  ----------------
- *  1 1 0 1 1  1 1 0
- *  |=======|  |===|
- *      |      leftmost bits repeated to fill open bits
- *      |
- *  original bits
- */
 static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
                                   int src_size)
 {
@@ -268,9 +247,9 @@ static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -283,9 +262,9 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -299,13 +278,13 @@ static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 #else
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
 #endif
     }
@@ -321,13 +300,13 @@ static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 #else
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
 #endif
     }
@@ -374,9 +353,9 @@ static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
         for (i = 0; i < chromWidth; i += 2) {
             uint64_t k = yc[0] + (uc[0] << 8) +
-                         (yc[1] << 16) + (vc[0] << 24);
+                         (yc[1] << 16) + (unsigned)(vc[0] << 24);
             uint64_t l = yc[2] + (uc[1] << 8) +
-                         (yc[3] << 16) + (vc[1] << 24);
+                         (yc[3] << 16) + (unsigned)(vc[1] << 24);
             *ldst++ = k + (l << 32);
             yc     += 4;
             uc     += 2;
@@ -438,9 +417,9 @@ static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
         for (i = 0; i < chromWidth; i += 2) {
             uint64_t k = uc[0] + (yc[0] << 8) +
-                         (vc[0] << 16) + (yc[1] << 24);
+                         (vc[0] << 16) + (unsigned)(yc[1] << 24);
             uint64_t l = uc[1] + (yc[2] << 8) +
-                         (vc[1] << 16) + (yc[3] << 24);
+                         (vc[1] << 16) + (unsigned)(yc[3] << 24);
             *ldst++ = k + (l << 32);
             yc     += 4;
             uc     += 2;
@@ -665,6 +644,9 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
         ydst += lumStride;
         src  += srcStride;
 
+        if (y+1 == height)
+            break;
+
         for (i = 0; i < chromWidth; i++) {
             unsigned int b = src[6 * i + 0];
             unsigned int g = src[6 * i + 1];
diff --git a/libswscale/sparc/yuv2rgb_vis.c b/libswscale/sparc/yuv2rgb_vis.c
index cd8a8b0b9c..ed008377ad 100644
--- a/libswscale/sparc/yuv2rgb_vis.c
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -2,20 +2,20 @@
  * VIS optimized software YUV to RGB converter
  * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/swscale-test.c b/libswscale/swscale-test.c
index 12fa9edd50..40dec546b0 100644
--- a/libswscale/swscale-test.c
+++ b/libswscale/swscale-test.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -90,7 +90,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
     static int srcStride[4];
     uint8_t *dst[4] = { 0 };
     uint8_t *out[4] = { 0 };
-    int dstStride[4];
+    int dstStride[4] = {0};
     int i;
     uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
     struct SwsContext *dstContext = NULL, *outContext = NULL;
@@ -106,6 +106,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
 
         av_image_fill_linesizes(srcStride, srcFormat, srcW);
         for (p = 0; p < 4; p++) {
+            srcStride[p] = FFALIGN(srcStride[p], 16);
             if (srcStride[p])
                 src[p] = av_mallocz(srcStride[p] * srcH + 16);
             if (srcStride[p] && !src[p]) {
@@ -139,6 +140,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
          * allocated with av_malloc). */
         /* An extra 16 bytes is being allocated because some scalers may write
          * out of bounds. */
+        dstStride[i] = FFALIGN(dstStride[i], 16);
         if (dstStride[i])
             dst[i] = av_mallocz(dstStride[i] * dstH + 16);
         if (dstStride[i] && !dst[i]) {
@@ -177,6 +179,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
         ssdA = r->ssdA;
     } else {
         for (i = 0; i < 4; i++) {
+            refStride[i] = FFALIGN(refStride[i], 16);
             if (refStride[i])
                 out[i] = av_mallocz(refStride[i] * h);
             if (refStride[i] && !out[i]) {
@@ -313,7 +316,8 @@ static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
         srcFormat = av_get_pix_fmt(srcStr);
         dstFormat = av_get_pix_fmt(dstStr);
 
-        if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE) {
+        if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
+            srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
             fprintf(stderr, "malformed input file\n");
             return -1;
         }
@@ -352,34 +356,20 @@ int main(int argc, char **argv)
     AVLFG rand;
     int res = -1;
     int i;
+    FILE *fp = NULL;
 
     if (!rgb_data || !data)
         return -1;
 
-    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
-                         AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
-
-    av_lfg_init(&rand, 1);
-
-    for (y = 0; y < H; y++)
-        for (x = 0; x < W * 4; x++)
-            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
-    sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
-    sws_freeContext(sws);
-    av_free(rgb_data);
-
     for (i = 1; i < argc; i += 2) {
         if (argv[i][0] != '-' || i + 1 == argc)
             goto bad_option;
         if (!strcmp(argv[i], "-ref")) {
-            FILE *fp = fopen(argv[i + 1], "r");
+            fp = fopen(argv[i + 1], "r");
             if (!fp) {
                 fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
                 goto error;
             }
-            res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
-            fclose(fp);
-            goto end;
         } else if (!strcmp(argv[i], "-src")) {
             srcFormat = av_get_pix_fmt(argv[i + 1]);
             if (srcFormat == AV_PIX_FMT_NONE) {
@@ -399,9 +389,25 @@ bad_option:
         }
     }
 
-    selfTest(src, stride, W, H, srcFormat, dstFormat);
-end:
-    res = 0;
+    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
+                         AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+
+    av_lfg_init(&rand, 1);
+
+    for (y = 0; y < H; y++)
+        for (x = 0; x < W * 4; x++)
+            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+    sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
+    sws_freeContext(sws);
+    av_free(rgb_data);
+
+    if(fp) {
+        res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
+        fclose(fp);
+    } else {
+        selfTest(src, stride, W, H, srcFormat, dstFormat);
+        res = 0;
+    }
 error:
     av_free(data);
 
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index c1920de0a6..632e85a44f 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
@@ -61,28 +62,6 @@ static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
     }
 }
 
-static void fill_plane9or10(uint8_t *plane, int stride, int width,
-                            int height, int y, uint8_t val,
-                            const int dst_depth, const int big_endian)
-{
-    int i, j;
-    uint16_t *dst = (uint16_t *) (plane + stride * y);
-#define FILL8TO9_OR_10(wfunc) \
-    for (i = 0; i < height; i++) { \
-        for (j = 0; j < width; j++) { \
-            wfunc(&dst[j], (val << (dst_depth - 8)) |  \
-                               (val >> (16 - dst_depth))); \
-        } \
-        dst += stride / 2; \
-    }
-    if (big_endian) {
-        FILL8TO9_OR_10(AV_WB16);
-    } else {
-        FILL8TO9_OR_10(AV_WL16);
-    }
-}
-
-
 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
                            const uint8_t *_src, const int16_t *filter,
                            const int32_t *filterPos, int filterSize)
@@ -94,6 +73,9 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
     int bits            = desc->comp[0].depth_minus1;
     int sh              = bits - 4;
 
+    if((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth_minus1<15)
+        sh= 9;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -116,6 +98,9 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
     const uint16_t *src = (const uint16_t *) _src;
     int sh              = desc->comp[0].depth_minus1;
 
+    if(sh<15)
+        sh= isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : desc->comp[0].depth_minus1;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -232,7 +217,7 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
     int i;
     int32_t *dst = (int32_t *) _dst;
     for (i = 0; i < width; i++)
-        dst[i] = (dst[i] * 14071 + (33561947 << 4)) >> 14;
+        dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
 }
 
 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
@@ -246,6 +231,8 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
         xpos  += xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
 }
 
 // *** horizontal scale Y line to temp buffer
@@ -258,13 +245,13 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
                                      uint8_t *formatConvBuffer,
                                      uint32_t *pal, int isAlpha)
 {
-    void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) =
+    void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) =
         isAlpha ? c->alpToYV12 : c->lumToYV12;
     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
     const uint8_t *src = src_in[isAlpha ? 3 : 0];
 
     if (toYV12) {
-        toYV12(formatConvBuffer, src, srcW, pal);
+        toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
         src = formatConvBuffer;
     } else if (c->readLumPlanar && !isAlpha) {
         c->readLumPlanar(formatConvBuffer, src_in, srcW);
@@ -295,6 +282,10 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
         dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
         xpos   += xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
 }
 
 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
@@ -309,13 +300,13 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
     if (c->chrToYV12) {
         uint8_t *buf2 = formatConvBuffer +
-                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
-        c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
-        src1 = formatConvBuffer;
-        src2 = buf2;
+                        FFALIGN(srcW*2+78, 16);
+        c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
+        src1= formatConvBuffer;
+        src2= buf2;
     } else if (c->readChrPlanar) {
         uint8_t *buf2 = formatConvBuffer +
-                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
+                        FFALIGN(srcW*2+78, 16);
         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
         src1 = formatConvBuffer;
         src2 = buf2;
@@ -356,8 +347,6 @@ static int swScale(SwsContext *c, const uint8_t *src[],
     int32_t *vChrFilterPos           = c->vChrFilterPos;
     int32_t *hLumFilterPos           = c->hLumFilterPos;
     int32_t *hChrFilterPos           = c->hChrFilterPos;
-    int16_t *vLumFilter              = c->vLumFilter;
-    int16_t *vChrFilter              = c->vChrFilter;
     int16_t *hLumFilter              = c->hLumFilter;
     int16_t *hChrFilter              = c->hChrFilter;
     int32_t *lumMmxFilter            = c->lumMmxFilter;
@@ -416,8 +405,8 @@ static int swScale(SwsContext *c, const uint8_t *src[],
     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
 
-    if (dstStride[0] % 8 != 0 || dstStride[1] % 8 != 0 ||
-        dstStride[2] % 8 != 0 || dstStride[3] % 8 != 0) {
+    if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 ||
+        dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
         static int warnedAlready = 0; // FIXME maybe move this into the context
         if (flags & SWS_PRINT_INFO && !warnedAlready) {
             av_log(c, AV_LOG_WARNING,
@@ -427,6 +416,18 @@ static int swScale(SwsContext *c, const uint8_t *src[],
         }
     }
 
+    if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
+        || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
+        || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
+    ) {
+        static int warnedAlready=0;
+        int cpu_flags = av_get_cpu_flags();
+        if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
+            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
+            warnedAlready=1;
+        }
+    }
+
     /* Note the user might start scaling the picture in the middle so this
      * will not get executed. This is not really intended but works
      * currently, so people might do it. */
@@ -451,6 +452,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
             dst[2] + dstStride[2] * chrDstY,
             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
         };
+        int use_mmx_vfilter= c->use_mmx_vfilter;
 
         // First line needed as input
         const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
@@ -469,8 +471,8 @@ static int swScale(SwsContext *c, const uint8_t *src[],
             lastInLumBuf = firstLumSrcY - 1;
         if (firstChrSrcY > lastInChrBuf)
             lastInChrBuf = firstChrSrcY - 1;
-        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
-        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
+        av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
+        av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
 
         DEBUG_BUFFERS("dstY: %d\n", dstY);
         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
@@ -498,9 +500,9 @@ static int swScale(SwsContext *c, const uint8_t *src[],
                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
             };
             lumBufIndex++;
-            assert(lumBufIndex < 2 * vLumBufSize);
-            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
-            assert(lastInLumBuf + 1 - srcSliceY >= 0);
+            av_assert0(lumBufIndex < 2 * vLumBufSize);
+            av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
+            av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
             hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
                     hLumFilter, hLumFilterPos, hLumFilterSize,
                     formatConvBuffer, pal, 0);
@@ -520,9 +522,9 @@ static int swScale(SwsContext *c, const uint8_t *src[],
                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
             };
             chrBufIndex++;
-            assert(chrBufIndex < 2 * vChrBufSize);
-            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
-            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
+            av_assert0(chrBufIndex < 2 * vChrBufSize);
+            av_assert0(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
+            av_assert0(lastInChrBuf + 1 - chrSrcSliceY >= 0);
             // FIXME replace parameters through context struct (some at least)
 
             if (c->needs_hcscale)
@@ -555,103 +557,81 @@ static int swScale(SwsContext *c, const uint8_t *src[],
              * this array's tail */
             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                      &yuv2packed1, &yuv2packed2, &yuv2packedX);
+            use_mmx_vfilter= 0;
         }
 
         {
-            const int16_t **lumSrcPtr  = (const int16_t **)lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrUSrcPtr = (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **chrVSrcPtr = (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+            const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
-                                         (const int16_t **)alpPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-
-            if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
-                const int16_t **tmpY = (const int16_t **)lumPixBuf +
-                                       2 * vLumBufSize;
-                int neg = -firstLumSrcY, i;
-                int end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
-                for (i = 0; i < neg; i++)
-                    tmpY[i] = lumSrcPtr[neg];
-                for (; i < end; i++)
-                    tmpY[i] = lumSrcPtr[i];
-                for (; i < vLumFilterSize; i++)
-                    tmpY[i] = tmpY[i - 1];
-                lumSrcPtr = tmpY;
-
-                if (alpSrcPtr) {
-                    const int16_t **tmpA = (const int16_t **)alpPixBuf +
-                                           2 * vLumBufSize;
-                    for (i = 0; i < neg; i++)
-                        tmpA[i] = alpSrcPtr[neg];
-                    for (; i < end; i++)
-                        tmpA[i] = alpSrcPtr[i];
-                    for (; i < vLumFilterSize; i++)
-                        tmpA[i] = tmpA[i - 1];
-                    alpSrcPtr = tmpA;
-                }
-            }
-            if (firstChrSrcY < 0 ||
-                firstChrSrcY + vChrFilterSize > c->chrSrcH) {
-                const int16_t **tmpU = (const int16_t **)chrUPixBuf + 2 * vChrBufSize,
-                **tmpV               = (const int16_t **)chrVPixBuf + 2 * vChrBufSize;
-                int neg = -firstChrSrcY, i;
-                int end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
-                for (i = 0; i < neg; i++) {
-                    tmpU[i] = chrUSrcPtr[neg];
-                    tmpV[i] = chrVSrcPtr[neg];
-                }
-                for (; i < end; i++) {
-                    tmpU[i] = chrUSrcPtr[i];
-                    tmpV[i] = chrVSrcPtr[i];
-                }
-                for (; i < vChrFilterSize; i++) {
-                    tmpU[i] = tmpU[i - 1];
-                    tmpV[i] = tmpV[i - 1];
-                }
-                chrUSrcPtr = tmpU;
-                chrVSrcPtr = tmpV;
-            }
+                                         (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+            int16_t *vLumFilter = c->vLumFilter;
+            int16_t *vChrFilter = c->vChrFilter;
 
             if (isPlanarYUV(dstFormat) ||
                 (isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like
                 const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
 
+                vLumFilter +=    dstY * vLumFilterSize;
+                vChrFilter += chrDstY * vChrFilterSize;
+
+//                 av_assert0(use_mmx_vfilter != (
+//                                yuv2planeX == yuv2planeX_10BE_c
+//                             || yuv2planeX == yuv2planeX_10LE_c
+//                             || yuv2planeX == yuv2planeX_9BE_c
+//                             || yuv2planeX == yuv2planeX_9LE_c
+//                             || yuv2planeX == yuv2planeX_16BE_c
+//                             || yuv2planeX == yuv2planeX_16LE_c
+//                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
+
+                if(use_mmx_vfilter){
+                    vLumFilter= c->lumMmxFilter;
+                    vChrFilter= c->chrMmxFilter;
+                }
+
                 if (vLumFilterSize == 1) {
                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
                 } else {
-                    yuv2planeX(vLumFilter + dstY * vLumFilterSize,
-                               vLumFilterSize, lumSrcPtr, dest[0],
+                    yuv2planeX(vLumFilter, vLumFilterSize,
+                               lumSrcPtr, dest[0],
                                dstW, c->lumDither8, 0);
                 }
 
                 if (!((dstY & chrSkipMask) || isGray(dstFormat))) {
                     if (yuv2nv12cX) {
-                        yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2nv12cX(c, vChrFilter,
                                    vChrFilterSize, chrUSrcPtr, chrVSrcPtr,
                                    dest[1], chrDstW);
                     } else if (vChrFilterSize == 1) {
                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
                     } else {
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2planeX(vChrFilter,
                                    vChrFilterSize, chrUSrcPtr, dest[1],
                                    chrDstW, c->chrDither8, 0);
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2planeX(vChrFilter,
                                    vChrFilterSize, chrVSrcPtr, dest[2],
-                                   chrDstW, c->chrDither8, 3);
+                                   chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
                     }
                 }
 
                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    if(use_mmx_vfilter){
+                        vLumFilter= c->alpMmxFilter;
+                    }
                     if (vLumFilterSize == 1) {
                         yuv2plane1(alpSrcPtr[0], dest[3], dstW,
                                    c->lumDither8, 0);
                     } else {
-                        yuv2planeX(vLumFilter + dstY * vLumFilterSize,
+                        yuv2planeX(vLumFilter,
                                    vLumFilterSize, alpSrcPtr, dest[3],
                                    dstW, c->lumDither8, 0);
                     }
                 }
             } else {
+                av_assert1(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize * 2);
+                av_assert1(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize * 2);
                 if (c->yuv2packed1 && vLumFilterSize == 1 &&
                     vChrFilterSize <= 2) { // unscaled RGB
                     int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
@@ -679,18 +659,15 @@ static int swScale(SwsContext *c, const uint8_t *src[],
             }
         }
     }
-
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
         int length = dstW;
         int height = dstY - lastDstY;
-        if (is16BPS(c->dstFormat))
-            length *= 2;
 
-        if (is9_OR_10BPS(dstFormat)) {
+        if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
             const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
-            fill_plane9or10(dst[3], dstStride[3], length, height, lastDstY,
-                            255, desc->comp[3].depth_minus1 + 1,
-                            isBE(dstFormat));
+            fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
+                    1, desc->comp[3].depth_minus1,
+                    isBE(dstFormat));
         } else
             fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
     }
@@ -721,8 +698,9 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 
     ff_sws_init_input_funcs(c);
 
+
     if (c->srcBpc == 8) {
-        if (c->dstBpc <= 10) {
+        if (c->dstBpc <= 14) {
             c->hyScale = c->hcScale = hScale8To15_c;
             if (c->flags & SWS_FAST_BILINEAR) {
                 c->hyscale_fast = hyscale_fast_c;
@@ -732,12 +710,12 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
             c->hyScale = c->hcScale = hScale8To19_c;
         }
     } else {
-        c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c
+        c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c
                                                  : hScale16To15_c;
     }
 
     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
-        if (c->dstBpc <= 10) {
+        if (c->dstBpc <= 14) {
             if (c->srcRange) {
                 c->lumConvertRange = lumRangeFromJpeg_c;
                 c->chrConvertRange = chrRangeFromJpeg_c;
@@ -772,3 +750,204 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
 
     return swScale;
 }
+
+static void reset_ptr(const uint8_t *src[], int format)
+{
+    if (!isALPHA(format))
+        src[3] = NULL;
+    if (!isPlanar(format)) {
+        src[3] = src[2] = NULL;
+
+        if (!usePal(format))
+            src[1] = NULL;
+    }
+}
+
+static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt,
+                                const int linesizes[4])
+{
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        int plane = desc->comp[i].plane;
+        if (!data[plane] || !linesizes[plane])
+            return 0;
+    }
+
+    return 1;
+}
+
+/**
+ * swscale wrapper, so we don't need to export the SwsContext.
+ * Assumes planar YUV to be in YUV order instead of YVU.
+ */
+int attribute_align_arg sws_scale(struct SwsContext *c,
+                                  const uint8_t * const srcSlice[],
+                                  const int srcStride[], int srcSliceY,
+                                  int srcSliceH, uint8_t *const dst[],
+                                  const int dstStride[])
+{
+    int i, ret;
+    const uint8_t *src2[4] = { srcSlice[0], srcSlice[1], srcSlice[2], srcSlice[3] };
+    uint8_t *dst2[4] = { dst[0], dst[1], dst[2], dst[3] };
+    uint8_t *rgb0_tmp = NULL;
+
+    // do not mess up sliceDir if we have a "trailing" 0-size slice
+    if (srcSliceH == 0)
+        return 0;
+
+    if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
+        av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
+        return 0;
+    }
+    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
+        av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
+        return 0;
+    }
+
+    if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
+        av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
+        return 0;
+    }
+    if (c->sliceDir == 0) {
+        if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
+    }
+
+    if (usePal(c->srcFormat)) {
+        for (i = 0; i < 256; i++) {
+            int p, r, g, b, y, u, v, a = 0xff;
+            if (c->srcFormat == AV_PIX_FMT_PAL8) {
+                p = ((const uint32_t *)(srcSlice[1]))[i];
+                a = (p >> 24) & 0xFF;
+                r = (p >> 16) & 0xFF;
+                g = (p >>  8) & 0xFF;
+                b =  p        & 0xFF;
+            } else if (c->srcFormat == AV_PIX_FMT_RGB8) {
+                r = ( i >> 5     ) * 36;
+                g = ((i >> 2) & 7) * 36;
+                b = ( i       & 3) * 85;
+            } else if (c->srcFormat == AV_PIX_FMT_BGR8) {
+                b = ( i >> 6     ) * 85;
+                g = ((i >> 3) & 7) * 36;
+                r = ( i       & 7) * 36;
+            } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
+                r = ( i >> 3     ) * 255;
+                g = ((i >> 1) & 3) * 85;
+                b = ( i       & 1) * 255;
+            } else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) {
+                r = g = b = i;
+            } else {
+                av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
+                b = ( i >> 3     ) * 255;
+                g = ((i >> 1) & 3) * 85;
+                r = ( i       & 1) * 255;
+            }
+#define RGB2YUV_SHIFT 15
+#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
+
+            y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+            u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+            v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+            c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24);
+
+            switch (c->dstFormat) {
+            case AV_PIX_FMT_BGR32:
+#if !HAVE_BIGENDIAN
+            case AV_PIX_FMT_RGB24:
+#endif
+                c->pal_rgb[i]=  r + (g<<8) + (b<<16) + ((unsigned)a<<24);
+                break;
+            case AV_PIX_FMT_BGR32_1:
+#if HAVE_BIGENDIAN
+            case AV_PIX_FMT_BGR24:
+#endif
+                c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24);
+                break;
+            case AV_PIX_FMT_RGB32_1:
+#if HAVE_BIGENDIAN
+            case AV_PIX_FMT_RGB24:
+#endif
+                c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24);
+                break;
+            case AV_PIX_FMT_RGB32:
+#if !HAVE_BIGENDIAN
+            case AV_PIX_FMT_BGR24:
+#endif
+            default:
+                c->pal_rgb[i]=  b + (g<<8) + (r<<16) + ((unsigned)a<<24);
+            }
+        }
+    }
+
+    if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
+        uint8_t *base;
+        int x,y;
+        rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
+        base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
+        for (y=0; y<srcSliceH; y++){
+            memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
+            for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
+                base[ srcStride[0]*y + x] = 0xFF;
+            }
+        }
+        src2[0] = base;
+    }
+
+    // copy strides, so they can safely be modified
+    if (c->sliceDir == 1) {
+        // slices go from top to bottom
+        int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2],
+                              srcStride[3] };
+        int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2],
+                              dstStride[3] };
+
+        reset_ptr(src2, c->srcFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
+
+        /* reset slice direction at end of frame */
+        if (srcSliceY + srcSliceH == c->srcH)
+            c->sliceDir = 0;
+
+        ret = c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
+                          dstStride2);
+    } else {
+        // slices go from bottom to top => we flip the image internally
+        int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2],
+                              -srcStride[3] };
+        int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2],
+                              -dstStride[3] };
+
+        src2[0] += (srcSliceH - 1) * srcStride[0];
+        if (!usePal(c->srcFormat))
+            src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
+        src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
+        src2[3] += (srcSliceH - 1) * srcStride[3];
+        dst2[0] += ( c->dstH                         - 1) * dstStride[0];
+        dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
+        dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
+        dst2[3] += ( c->dstH                         - 1) * dstStride[3];
+
+        reset_ptr(src2, c->srcFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
+
+        /* reset slice direction at end of frame */
+        if (!srcSliceY)
+            c->sliceDir = 0;
+
+        ret = c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
+                          srcSliceH, dst2, dstStride2);
+    }
+
+    av_free(rgb0_tmp);
+    return ret;
+}
+
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index 598946d807..ea2116c1fa 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -217,7 +217,13 @@ int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
               uint8_t *const dst[], const int dstStride[]);
 
 /**
- * @param inv_table the yuv2rgb coefficients, normally ff_yuv2rgb_coeffs[x]
+ * @param dstRange flag indicating the while-black range of the output (1=jpeg / 0=mpeg)
+ * @param srcRange flag indicating the while-black range of the input (1=jpeg / 0=mpeg)
+ * @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param brightness 16.16 fixed point brightness correction
+ * @param contrast 16.16 fixed point contrast correction
+ * @param saturation 16.16 fixed point saturation correction
  * @return -1 if not supported
  */
 int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index c239c2a3ee..1dc7242a78 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -30,16 +30,21 @@
 #include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
 #include "libavutil/log.h"
 #include "libavutil/pixfmt.h"
 #include "libavutil/pixdesc.h"
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
+#define YUVRGB_TABLE_HEADROOM 128
+
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
 
 #define MAX_FILTER_SIZE 256
 
+#define DITHER1XBPP
+
 #if HAVE_BIGENDIAN
 #define ALT32_CORR (-1)
 #else
@@ -317,10 +322,10 @@ typedef struct SwsContext {
     int dstY;                     ///< Last destination vertical line output from last slice.
     int flags;                    ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
     void *yuvTable;             // pointer to the yuv->rgb table start so it can be freed()
-    uint8_t *table_rV[256];
-    uint8_t *table_gU[256];
-    int table_gV[256];
-    uint8_t *table_bU[256];
+    uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+    int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
 
     //Colorspace stuff
     int contrast, brightness, saturation;    // for sws_getColorspaceDetails
@@ -328,6 +333,8 @@ typedef struct SwsContext {
     int dstColorspaceTable[4];
     int srcRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (source      image).
     int dstRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+    int src0Alpha;
+    int dst0Alpha;
     int yuv2rgb_y_offset;
     int yuv2rgb_y_coeff;
     int yuv2rgb_v2r_coeff;
@@ -384,8 +391,8 @@ typedef struct SwsContext {
     // alignment of these values is not necessary, but merely here
     // to maintain the same offset across x8632 and x86-64. Once we
     // use proper offset macros in the asm, they can be removed.
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_px);   ///< offset (in pixels) between u and v planes
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_byte); ///< offset (in bytes) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
     DECLARE_ALIGNED(8, uint16_t, dither16)[8];
     DECLARE_ALIGNED(8, uint32_t, dither32)[8];
 
@@ -419,6 +426,7 @@ typedef struct SwsContext {
 #if HAVE_VIS
     DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
 #endif
+    int use_mmx_vfilter;
 
     /* function pointers for swScale() */
     yuv2planar1_fn yuv2plane1;
@@ -429,14 +437,14 @@ typedef struct SwsContext {
     yuv2packedX_fn yuv2packedX;
 
     /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
-    void (*lumToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
-    void (*alpToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
     void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+                      const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
 
     /**
@@ -541,7 +549,13 @@ SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
 SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
 void ff_bfin_get_unscaled_swscale(SwsContext *c);
 
+#if FF_API_SWS_FORMAT_NAME
+/**
+ * @deprecated Use av_get_pix_fmt_name() instead.
+ */
+attribute_deprecated
 const char *sws_format_name(enum AVPixelFormat format);
+#endif
 
 static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
 {
@@ -554,9 +568,11 @@ static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     av_assert0(desc);
-    return desc->comp[0].depth_minus1 == 8 || desc->comp[0].depth_minus1 == 9;
+    return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13;
 }
 
+#define isNBPS(x) is9_OR_10BPS(x)
+
 static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -587,8 +603,8 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
 
 #if 0 // FIXME
 #define isGray(x) \
-    (!(av_pix_fmt_descriptors[x].flags & PIX_FMT_PAL) && \
-     av_pix_fmt_descriptors[x].nb_components <= 2)
+    (!(av_pix_fmt_desc_get(x)->flags & PIX_FMT_PAL) && \
+     av_pix_fmt_desc_get(x)->nb_components <= 2)
 #else
 #define isGray(x)                      \
     ((x) == AV_PIX_FMT_GRAY8       ||  \
@@ -597,9 +613,12 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
      (x) == AV_PIX_FMT_GRAY16LE)
 #endif
 
-#define isRGBinInt(x)                  \
-    ((x) == AV_PIX_FMT_RGB48BE     ||  \
+#define isRGBinInt(x) \
+    (           \
+     (x) == AV_PIX_FMT_RGB48BE     ||  \
      (x) == AV_PIX_FMT_RGB48LE     ||  \
+     (x) == AV_PIX_FMT_RGBA64BE    ||  \
+     (x) == AV_PIX_FMT_RGBA64LE    ||  \
      (x) == AV_PIX_FMT_RGB32       ||  \
      (x) == AV_PIX_FMT_RGB32_1     ||  \
      (x) == AV_PIX_FMT_RGB24       ||  \
@@ -613,11 +632,14 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
      (x) == AV_PIX_FMT_RGB4        ||  \
      (x) == AV_PIX_FMT_RGB4_BYTE   ||  \
      (x) == AV_PIX_FMT_MONOBLACK   ||  \
-     (x) == AV_PIX_FMT_MONOWHITE)
-
-#define isBGRinInt(x)                  \
-    ((x) == AV_PIX_FMT_BGR48BE     ||  \
+     (x) == AV_PIX_FMT_MONOWHITE   \
+    )
+#define isBGRinInt(x) \
+    (           \
+     (x) == AV_PIX_FMT_BGR48BE     ||  \
      (x) == AV_PIX_FMT_BGR48LE     ||  \
+     (x) == AV_PIX_FMT_BGRA64BE    ||  \
+     (x) == AV_PIX_FMT_BGRA64LE    ||  \
      (x) == AV_PIX_FMT_BGR32       ||  \
      (x) == AV_PIX_FMT_BGR32_1     ||  \
      (x) == AV_PIX_FMT_BGR24       ||  \
@@ -631,11 +653,34 @@ static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
      (x) == AV_PIX_FMT_BGR4        ||  \
      (x) == AV_PIX_FMT_BGR4_BYTE   ||  \
      (x) == AV_PIX_FMT_MONOBLACK   ||  \
-     (x) == AV_PIX_FMT_MONOWHITE)
-
-#define isAnyRGB(x)                    \
-    (isRGBinInt(x)              ||     \
-     isBGRinInt(x))
+     (x) == AV_PIX_FMT_MONOWHITE   \
+    )
+
+#define isRGBinBytes(x) (           \
+           (x) == AV_PIX_FMT_RGB48BE     \
+        || (x) == AV_PIX_FMT_RGB48LE     \
+        || (x) == AV_PIX_FMT_RGBA64BE    \
+        || (x) == AV_PIX_FMT_RGBA64LE    \
+        || (x) == AV_PIX_FMT_RGBA        \
+        || (x) == AV_PIX_FMT_ARGB        \
+        || (x) == AV_PIX_FMT_RGB24       \
+    )
+#define isBGRinBytes(x) (           \
+           (x) == AV_PIX_FMT_BGR48BE     \
+        || (x) == AV_PIX_FMT_BGR48LE     \
+        || (x) == AV_PIX_FMT_BGRA64BE    \
+        || (x) == AV_PIX_FMT_BGRA64LE    \
+        || (x) == AV_PIX_FMT_BGRA        \
+        || (x) == AV_PIX_FMT_ABGR        \
+        || (x) == AV_PIX_FMT_BGR24       \
+    )
+
+#define isAnyRGB(x) \
+    (           \
+          isRGBinInt(x)       ||    \
+          isBGRinInt(x)       ||    \
+          (x)==AV_PIX_FMT_GBR24P     \
+    )
 
 static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
 {
@@ -644,6 +689,16 @@ static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
     return desc->nb_components == 2 || desc->nb_components == 4;
 }
 
+#if 1
+#define isPacked(x)         (       \
+           (x)==AV_PIX_FMT_PAL8        \
+        || (x)==AV_PIX_FMT_YUYV422     \
+        || (x)==AV_PIX_FMT_UYVY422     \
+        || (x)==AV_PIX_FMT_Y400A       \
+        ||  isRGBinInt(x)           \
+        ||  isBGRinInt(x)           \
+    )
+#else
 static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -652,6 +707,7 @@ static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
             pix_fmt == AV_PIX_FMT_PAL8);
 }
 
+#endif
 static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
@@ -678,12 +734,14 @@ static av_always_inline int usePal(enum AVPixelFormat pix_fmt)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
     av_assert0(desc);
-    return ((desc->flags & PIX_FMT_PAL) || (desc->flags & PIX_FMT_PSEUDOPAL) ||
-            pix_fmt == AV_PIX_FMT_Y400A);
+    return (desc->flags & PIX_FMT_PAL) || (desc->flags & PIX_FMT_PSEUDOPAL);
 }
 
 extern const uint64_t ff_dither4[2];
 extern const uint64_t ff_dither8[2];
+extern const uint8_t dithers[8][8][8];
+extern const uint16_t dither_scale[15][16];
+
 
 extern const AVClass sws_context_class;
 
@@ -712,4 +770,24 @@ void ff_sws_init_output_funcs(SwsContext *c,
 void ff_sws_init_swScale_altivec(SwsContext *c);
 void ff_sws_init_swScale_mmx(SwsContext *c);
 
+static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
+                               int alpha, int bits, const int big_endian)
+{
+    int i, j;
+    uint8_t *ptr = plane + stride * y;
+    int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits);
+    for (i = 0; i < height; i++) {
+#define FILL(wfunc) \
+        for (j = 0; j < width; j++) {\
+            wfunc(ptr+2*j, v);\
+        }
+        if (big_endian) {
+            FILL(AV_WB16);
+        } else {
+            FILL(AV_WL16);
+        }
+        ptr += stride;
+    }
+}
+
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 9b919f8aa4..f35d1ba352 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +23,6 @@
 #include <math.h>
 #include <stdio.h>
 #include "config.h"
-#include <assert.h>
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "rgb2rgb.h"
@@ -33,59 +32,101 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/bswap.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
 
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_1)[8][8] = {
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_3)[8][8] = {
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_64)[8][8] = {
-    {  18, 34, 30, 46, 17, 33, 29, 45,},
-    {  50,  2, 62, 14, 49,  1, 61, 13,},
-    {  26, 42, 22, 38, 25, 41, 21, 37,},
-    {  58, 10, 54,  6, 57,  9, 53,  5,},
-    {  16, 32, 28, 44, 19, 35, 31, 47,},
-    {  48,  0, 60, 12, 51,  3, 63, 15,},
-    {  24, 40, 20, 36, 27, 43, 23, 39,},
-    {  56,  8, 52,  4, 59, 11, 55,  7,},
-};
-extern const uint8_t dither_8x8_128[8][8];
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = {
-    {  72, 136, 120, 184,  68, 132, 116, 180,},
-    { 200,   8, 248,  56, 196,   4, 244,  52,},
-    { 104, 168,  88, 152, 100, 164,  84, 148,},
-    { 232,  40, 216,  24, 228,  36, 212,  20,},
-    {  64, 128, 102, 176,  76, 140, 124, 188,},
-    { 192,   0, 240,  48, 204,  12, 252,  60,},
-    {  96, 160,  80, 144, 108, 172,  92, 156,},
-    { 224,  32, 208,  16, 236,  44, 220,  28,},
+DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
+{
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+},{
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+},{
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+},{
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+},{
+  {   9, 17, 15, 23,  8, 16, 14, 22,},
+  {  25,  1, 31,  7, 24,  0, 30,  6,},
+  {  13, 21, 11, 19, 12, 20, 10, 18,},
+  {  29,  5, 27,  3, 28,  4, 26,  2,},
+  {   8, 16, 14, 22,  9, 17, 15, 23,},
+  {  24,  0, 30,  6, 25,  1, 31,  7,},
+  {  12, 20, 10, 18, 13, 21, 11, 19,},
+  {  28,  4, 26,  2, 29,  5, 27,  3,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  36, 68, 60, 92, 34, 66, 58, 90,},
+  { 100,  4,124, 28, 98,  2,122, 26,},
+  {  52, 84, 44, 76, 50, 82, 42, 74,},
+  { 116, 20,108, 12,114, 18,106, 10,},
+  {  32, 64, 56, 88, 38, 70, 62, 94,},
+  {  96,  0,120, 24,102,  6,126, 30,},
+  {  48, 80, 40, 72, 54, 86, 46, 78,},
+  { 112, 16,104,  8,118, 22,110, 14,},
+}};
+
+const uint16_t dither_scale[15][16]={
+{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
+{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
+{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
+{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
+{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
+{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
+{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
+{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
+{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
+{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
+{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
+{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
+{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
+{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
+{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 };
 
-#define RGB2YUV_SHIFT 15
-#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
-#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
                       uint8_t val)
@@ -98,27 +139,6 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
     }
 }
 
-static void fill_plane9or10(uint8_t *plane, int stride, int width,
-                            int height, int y, uint8_t val,
-                            const int dst_depth, const int big_endian)
-{
-    int i, j;
-    uint16_t *dst = (uint16_t *) (plane + stride * y);
-#define FILL8TO9_OR_10(wfunc) \
-    for (i = 0; i < height; i++) { \
-        for (j = 0; j < width; j++) { \
-            wfunc(&dst[j], (val << (dst_depth - 8)) |  \
-                               (val >> (16 - dst_depth))); \
-        } \
-        dst += stride / 2; \
-    }
-    if (big_endian) {
-        FILL8TO9_OR_10(AV_WB16);
-    } else {
-        FILL8TO9_OR_10(AV_WL16);
-    }
-}
-
 static void copyPlane(const uint8_t *src, int srcStride,
                       int srcSliceY, int srcSliceH, int width,
                       uint8_t *dst, int dstStride)
@@ -331,7 +351,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
     uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
     const uint8_t *srcPtr = src[0];
 
-    if (srcFormat == AV_PIX_FMT_Y400A) {
+    if (srcFormat == AV_PIX_FMT_GRAY8A) {
         switch (dstFormat) {
         case AV_PIX_FMT_RGB32  : conv = gray8aToPacked32; break;
         case AV_PIX_FMT_BGR32  : conv = gray8aToPacked32; break;
@@ -353,7 +373,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 
     if (!conv)
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     else {
         for (i = 0; i < srcSliceH; i++) {
             conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
@@ -475,6 +495,20 @@ static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t *src[],
         || (x) == AV_PIX_FMT_ABGR   \
         )
 
+#define isRGBA64(x) (                \
+           (x) == AV_PIX_FMT_RGBA64LE   \
+        || (x) == AV_PIX_FMT_RGBA64BE   \
+        || (x) == AV_PIX_FMT_BGRA64LE   \
+        || (x) == AV_PIX_FMT_BGRA64BE   \
+        )
+
+#define isRGB48(x) (                \
+           (x) == AV_PIX_FMT_RGB48LE   \
+        || (x) == AV_PIX_FMT_RGB48BE   \
+        || (x) == AV_PIX_FMT_BGR48LE   \
+        || (x) == AV_PIX_FMT_BGR48BE   \
+        )
+
 /* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
 typedef void (* rgbConvFn) (const uint8_t *, uint8_t *, int);
 static rgbConvFn findRgbConvFn(SwsContext *c)
@@ -484,17 +518,11 @@ static rgbConvFn findRgbConvFn(SwsContext *c)
     const int srcId = c->srcFormatBpp;
     const int dstId = c->dstFormatBpp;
     rgbConvFn conv = NULL;
-    const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
-    const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
 
 #define IS_NOT_NE(bpp, desc) \
     (((bpp + 7) >> 3) == 2 && \
      (!(desc->flags & PIX_FMT_BE) != !HAVE_BIGENDIAN))
 
-    /* if this is non-native rgb444/555/565, don't handle it here. */
-    if (IS_NOT_NE(srcId, desc_src) || IS_NOT_NE(dstId, desc_dst))
-        return NULL;
-
 #define CONV_IS(src, dst) (srcFormat == AV_PIX_FMT_##src && dstFormat == AV_PIX_FMT_##dst)
 
     if (isRGBA32(srcFormat) && isRGBA32(dstFormat)) {
@@ -510,6 +538,32 @@ static rgbConvFn findRgbConvFn(SwsContext *c)
               || CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103;
         else if (CONV_IS(BGRA, ABGR)
               || CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012;
+    } else if (isRGB48(srcFormat) && isRGB48(dstFormat)) {
+        if      (CONV_IS(RGB48LE, BGR48LE)
+              || CONV_IS(BGR48LE, RGB48LE)
+              || CONV_IS(RGB48BE, BGR48BE)
+              || CONV_IS(BGR48BE, RGB48BE)) conv = rgb48tobgr48_nobswap;
+        else if (CONV_IS(RGB48LE, BGR48BE)
+              || CONV_IS(BGR48LE, RGB48BE)
+              || CONV_IS(RGB48BE, BGR48LE)
+              || CONV_IS(BGR48BE, RGB48LE)) conv = rgb48tobgr48_bswap;
+    } else if (isRGBA64(srcFormat) && isRGB48(dstFormat)) {
+        if      (CONV_IS(RGBA64LE, BGR48LE)
+              || CONV_IS(BGRA64LE, RGB48LE)
+              || CONV_IS(RGBA64BE, BGR48BE)
+              || CONV_IS(BGRA64BE, RGB48BE)) conv = rgb64tobgr48_nobswap;
+        else if (CONV_IS(RGBA64LE, BGR48BE)
+              || CONV_IS(BGRA64LE, RGB48BE)
+              || CONV_IS(RGBA64BE, BGR48LE)
+              || CONV_IS(BGRA64BE, RGB48LE)) conv = rgb64tobgr48_bswap;
+        else if (CONV_IS(RGBA64LE, RGB48LE)
+              || CONV_IS(BGRA64LE, BGR48LE)
+              || CONV_IS(RGBA64BE, RGB48BE)
+              || CONV_IS(BGRA64BE, BGR48BE)) conv = rgb64to48_nobswap;
+        else if (CONV_IS(RGBA64LE, RGB48BE)
+              || CONV_IS(BGRA64LE, BGR48BE)
+              || CONV_IS(RGBA64BE, RGB48LE)
+              || CONV_IS(BGRA64BE, BGR48LE)) conv = rgb64to48_bswap;
     } else
     /* BGR -> BGR */
     if ((isBGRinInt(srcFormat) && isBGRinInt(dstFormat)) ||
@@ -562,16 +616,21 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 {
     const enum AVPixelFormat srcFormat = c->srcFormat;
     const enum AVPixelFormat dstFormat = c->dstFormat;
+    const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
+    const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
     const int srcBpp = (c->srcFormatBpp + 7) >> 3;
     const int dstBpp = (c->dstFormatBpp + 7) >> 3;
     rgbConvFn conv = findRgbConvFn(c);
 
     if (!conv) {
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     } else {
         const uint8_t *srcPtr = src[0];
               uint8_t *dstPtr = dst[0];
+        int src_bswap = IS_NOT_NE(c->srcFormatBpp, desc_src);
+        int dst_bswap = IS_NOT_NE(c->dstFormatBpp, desc_dst);
+
         if ((srcFormat == AV_PIX_FMT_RGB32_1 || srcFormat == AV_PIX_FMT_BGR32_1) &&
             !isRGBA32(dstFormat))
             srcPtr += ALT32_CORR;
@@ -581,15 +640,23 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
             dstPtr += ALT32_CORR;
 
         if (dstStride[0] * srcBpp == srcStride[0] * dstBpp && srcStride[0] > 0 &&
-            !(srcStride[0] % srcBpp))
+            !(srcStride[0] % srcBpp) && !dst_bswap && !src_bswap)
             conv(srcPtr, dstPtr + dstStride[0] * srcSliceY,
                  srcSliceH * srcStride[0]);
         else {
-            int i;
+            int i, j;
             dstPtr += dstStride[0] * srcSliceY;
 
             for (i = 0; i < srcSliceH; i++) {
-                conv(srcPtr, dstPtr, c->srcW * srcBpp);
+                if(src_bswap) {
+                    for(j=0; j<c->srcW; j++)
+                        ((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]);
+                    conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp);
+                }else
+                    conv(srcPtr, dstPtr, c->srcW * srcBpp);
+                if(dst_bswap)
+                    for(j=0; j<c->srcW; j++)
+                        ((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]);
                 srcPtr += srcStride[0];
                 dstPtr += dstStride[0];
             }
@@ -647,7 +714,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
         while (length + c->srcW <= FFABS(dstStride[0]) &&
                length + c->srcW <= FFABS(srcStride[0]))
             length += c->srcW;
-        assert(length != 0);
+        av_assert1(length != 0);
 
         for (i = 0; i < srcSliceH; i++) {
             memcpy(dstPtr, srcPtr, length);
@@ -658,25 +725,25 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
-#define clip9(x)  av_clip_uintp2(x,  9)
-#define clip10(x) av_clip_uintp2(x, 10)
-#define DITHER_COPY(dst, dstStride, wfunc, src, srcStride, rfunc, dithers, shift, clip) \
-    for (i = 0; i < height; i++) { \
-        const uint8_t *dither = dithers[i & 7]; \
-        for (j = 0; j < length - 7; j += 8) { \
-            wfunc(&dst[j + 0], clip((rfunc(&src[j + 0]) + dither[0]) >> shift)); \
-            wfunc(&dst[j + 1], clip((rfunc(&src[j + 1]) + dither[1]) >> shift)); \
-            wfunc(&dst[j + 2], clip((rfunc(&src[j + 2]) + dither[2]) >> shift)); \
-            wfunc(&dst[j + 3], clip((rfunc(&src[j + 3]) + dither[3]) >> shift)); \
-            wfunc(&dst[j + 4], clip((rfunc(&src[j + 4]) + dither[4]) >> shift)); \
-            wfunc(&dst[j + 5], clip((rfunc(&src[j + 5]) + dither[5]) >> shift)); \
-            wfunc(&dst[j + 6], clip((rfunc(&src[j + 6]) + dither[6]) >> shift)); \
-            wfunc(&dst[j + 7], clip((rfunc(&src[j + 7]) + dither[7]) >> shift)); \
-        } \
-        for (; j < length; j++) \
-            wfunc(&dst[j],     (rfunc(&src[j]) + dither[j & 7]) >> shift); \
-        dst += dstStride; \
-        src += srcStride; \
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
+    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
+    for (i = 0; i < height; i++) {\
+        const uint8_t *dither= dithers[src_depth-9][i&7];\
+        for (j = 0; j < length-7; j+=8){\
+            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
+            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
+            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
+            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
+            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
+            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
+            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
+            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
+        }\
+        for (; j < length; j++)\
+            dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
+        dst += dstStride;\
+        src += srcStride;\
     }
 
 static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
@@ -692,169 +759,127 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
         int height = (plane == 0 || plane == 3) ? srcSliceH: -((-srcSliceH) >> c->chrDstVSubSample);
         const uint8_t *srcPtr = src[plane];
         uint8_t *dstPtr = dst[plane] + dstStride[plane] * y;
+        int shiftonly= plane==1 || plane==2 || (!c->srcRange && plane==0);
 
         if (!dst[plane])
             continue;
         // ignore palette for GRAY8
         if (plane == 1 && !dst[2]) continue;
         if (!src[plane] || (plane == 1 && !src[2])) {
-            int val = (plane == 3) ? 255 : 128;
-            if (is16BPS(c->dstFormat))
-                length *= 2;
-            if (is9_OR_10BPS(c->dstFormat)) {
-                fill_plane9or10(dst[plane], dstStride[plane],
-                                length, height, y, val,
-                                desc_dst->comp[plane].depth_minus1 + 1,
-                                isBE(c->dstFormat));
-            } else
+            if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) {
+                fillPlane16(dst[plane], dstStride[plane], length, height, y,
+                        plane == 3, desc_dst->comp[plane].depth_minus1,
+                        isBE(c->dstFormat));
+            } else {
                 fillPlane(dst[plane], dstStride[plane], length, height, y,
-                          val);
+                        (plane == 3) ? 255 : 128);
+            }
         } else {
-            if (is9_OR_10BPS(c->srcFormat)) {
+            if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat)
+               || (is16BPS(c->srcFormat) != is16BPS(c->dstFormat))
+            ) {
                 const int src_depth = desc_src->comp[plane].depth_minus1 + 1;
                 const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1;
                 const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
 
-                if (is16BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO16(rfunc, wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << (16 - src_depth)) | (srcpx >> (2 * src_depth - 16))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
+                if (dst_depth == 8) {
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , )
+                    } else {
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, )
                     }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                } else if (src_depth == 8) {
+                    for (i = 0; i < height; i++) {
+                        #define COPY816(w)\
+                        if(shiftonly){\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\
+                        }else{\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\
+                                               (srcPtr[j]>>(2*8-dst_depth)));\
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        if(isBE(c->dstFormat)){
+                            COPY816(AV_WB16)
                         } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                            COPY816(AV_WL16)
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr  += srcStride[plane];
                     }
-                } else if (is9_OR_10BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO9_OR_10(loop) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            loop; \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
-                    }
-#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \
-                    if (dst_depth > src_depth) { \
-                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \
-                    } else if (dst_depth < src_depth) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, clip9); \
-                    } else { \
-                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
+                } else if (src_depth <= dst_depth) {
+                    int orig_length = length;
+                    for (i = 0; i < height; i++) {
+                        if(isBE(c->srcFormat) == HAVE_BIGENDIAN &&
+                           isBE(c->dstFormat) == HAVE_BIGENDIAN &&
+                           shiftonly) {
+                             unsigned shift = dst_depth - src_depth;
+                             length = orig_length;
+#if HAVE_FAST_64BIT
+#define FAST_COPY_UP(shift) \
+    for (j = 0; j < length - 3; j += 4) { \
+        uint64_t v = AV_RN64A(srcPtr2 + j); \
+        AV_WN64A(dstPtr2 + j, v << shift); \
+    } \
+    length &= 3;
+#else
+#define FAST_COPY_UP(shift) \
+    for (j = 0; j < length - 1; j += 2) { \
+        uint32_t v = AV_RN32A(srcPtr2 + j); \
+        AV_WN32A(dstPtr2 + j, v << shift); \
+    } \
+    length &= 1;
+#endif
+                             switch (shift)
+                             {
+                             case 6: FAST_COPY_UP(6); break;
+                             case 7: FAST_COPY_UP(7); break;
+                             }
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+#define COPY_UP(r,w) \
+    if(shiftonly){\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], v<<(dst_depth-src_depth));\
+        }\
+    }else{\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \
+                        (v>>(2*src_depth-dst_depth)));\
+        }\
+    }
+                        if(isBE(c->srcFormat)){
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RB16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RB16, AV_WL16)
+                            }
                         } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RL16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RL16, AV_WL16)
+                            }
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr2 += srcStride[plane]/2;
                     }
                 } else {
-#define W8(a, b) { *(a) = (b); }
-#define COPY9_OR_10TO8(rfunc) \
-                    if (src_depth == 9) { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, av_clip_uint8); \
-                    } else { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_3, 2, av_clip_uint8); \
-                    }
-                    if (isBE(c->srcFormat)) {
-                        COPY9_OR_10TO8(AV_RB16);
-                    } else {
-                        COPY9_OR_10TO8(AV_RL16);
-                    }
-                }
-            } else if (is9_OR_10BPS(c->dstFormat)) {
-                const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1;
-                uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-
-                if (is16BPS(c->srcFormat)) {
-                    const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO9_OR_10(rfunc, wfunc) \
-                    if (dst_depth == 9) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_128, 7, clip9); \
-                    } else { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_64, 6, clip10); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16)
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                    }else{
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16)
                         }
                     }
-                } else /* 8bit */ {
-#define COPY8TO9_OR_10(wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            const int srcpx = srcPtr[j]; \
-                            wfunc(&dstPtr2[j], (srcpx << (dst_depth - 8)) | (srcpx >> (16 - dst_depth))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr  += srcStride[plane]; \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        COPY8TO9_OR_10(AV_WB16);
-                    } else {
-                        COPY8TO9_OR_10(AV_WL16);
-                    }
-                }
-            } else if (is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
-                const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO8(rfunc) \
-                    DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                srcPtr2, srcStride[plane] / 2, rfunc, \
-                                dither_8x8_256, 8, av_clip_uint8);
-                if (isBE(c->srcFormat)) {
-                    COPY16TO8(AV_RB16);
-                } else {
-                    COPY16TO8(AV_RL16);
-                }
-            } else if (!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) {
-                for (i = 0; i < height; i++) {
-                    for (j = 0; j < length; j++) {
-                        dstPtr[ j << 1     ] = srcPtr[j];
-                        dstPtr[(j << 1) + 1] = srcPtr[j];
-                    }
-                    srcPtr += srcStride[plane];
-                    dstPtr += dstStride[plane];
                 }
             } else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) &&
                       isBE(c->srcFormat) != isBE(c->dstFormat)) {
@@ -932,28 +957,32 @@ void ff_get_unscaled_swscale(SwsContext *c)
         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
         c->swScale= rgbToRgbWrapper;
 
-    if (isPlanarRGB(srcFormat) && isPackedRGB(dstFormat))
+#define isByteRGB(f) (\
+        f == AV_PIX_FMT_RGB32   ||\
+        f == AV_PIX_FMT_RGB32_1 ||\
+        f == AV_PIX_FMT_RGB24   ||\
+        f == AV_PIX_FMT_BGR32   ||\
+        f == AV_PIX_FMT_BGR32_1 ||\
+        f == AV_PIX_FMT_BGR24)
+
+    if (isAnyRGB(srcFormat) && isPlanar(srcFormat) && isByteRGB(dstFormat))
         c->swScale = planarRgbToRgbWrapper;
 
     /* bswap 16 bits per pixel/component packed formats */
     if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR444) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR48)  ||
+        IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGRA64) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR555) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR565) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GRAY16) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB444) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB48)  ||
+        IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGBA64) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB555) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB565))
         c->swScale = packed_16bpc_bswap;
 
-    if ((usePal(srcFormat) && (
-        dstFormat == AV_PIX_FMT_RGB32   ||
-        dstFormat == AV_PIX_FMT_RGB32_1 ||
-        dstFormat == AV_PIX_FMT_RGB24   ||
-        dstFormat == AV_PIX_FMT_BGR32   ||
-        dstFormat == AV_PIX_FMT_BGR32_1 ||
-        dstFormat == AV_PIX_FMT_BGR24)))
+    if (usePal(srcFormat) && isByteRGB(dstFormat))
         c->swScale = palToRgbWrapper;
 
     if (srcFormat == AV_PIX_FMT_YUV422P) {
@@ -984,13 +1013,14 @@ void ff_get_unscaled_swscale(SwsContext *c)
     if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P)
         c->swScale = uyvyToYuv422Wrapper;
 
+#define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_GRAY8A)
     /* simple copy */
     if ( srcFormat == dstFormat ||
         (srcFormat == AV_PIX_FMT_YUVA420P && dstFormat == AV_PIX_FMT_YUV420P) ||
         (srcFormat == AV_PIX_FMT_YUV420P && dstFormat == AV_PIX_FMT_YUVA420P) ||
-        (isPlanarYUV(srcFormat) && isGray(dstFormat)) ||
-        (isPlanarYUV(dstFormat) && isGray(srcFormat)) ||
-        (isGray(dstFormat) && isGray(srcFormat)) ||
+        (isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) ||
+        (isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) ||
+        (isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) ||
         (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) &&
          c->chrDstHSubSample == c->chrSrcHSubSample &&
          c->chrDstVSubSample == c->chrSrcVSubSample &&
@@ -1009,177 +1039,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
         ff_swscale_get_unscaled_altivec(c);
 }
 
-static void reset_ptr(const uint8_t *src[], int format)
-{
-    if (!isALPHA(format))
-        src[3] = NULL;
-    if (!isPlanar(format)) {
-        src[3] = src[2] = NULL;
-
-        if (!usePal(format))
-            src[1] = NULL;
-    }
-}
-
-static int check_image_pointers(uint8_t *data[4], enum AVPixelFormat pix_fmt,
-                                const int linesizes[4])
-{
-    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
-    int i;
-
-    for (i = 0; i < 4; i++) {
-        int plane = desc->comp[i].plane;
-        if (!data[plane] || !linesizes[plane])
-            return 0;
-    }
-
-    return 1;
-}
-
-/**
- * swscale wrapper, so we don't need to export the SwsContext.
- * Assumes planar YUV to be in YUV order instead of YVU.
- */
-int attribute_align_arg sws_scale(struct SwsContext *c,
-                                  const uint8_t * const srcSlice[],
-                                  const int srcStride[], int srcSliceY,
-                                  int srcSliceH, uint8_t *const dst[],
-                                  const int dstStride[])
-{
-    int i;
-    const uint8_t *src2[4] = { srcSlice[0], srcSlice[1], srcSlice[2], srcSlice[3] };
-    uint8_t *dst2[4] = { dst[0], dst[1], dst[2], dst[3] };
-
-    // do not mess up sliceDir if we have a "trailing" 0-size slice
-    if (srcSliceH == 0)
-        return 0;
-
-    if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
-        av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
-        return 0;
-    }
-    if (!check_image_pointers(dst, c->dstFormat, dstStride)) {
-        av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
-        return 0;
-    }
-
-    if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
-        av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
-        return 0;
-    }
-    if (c->sliceDir == 0) {
-        if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
-    }
-
-    if (usePal(c->srcFormat)) {
-        for (i = 0; i < 256; i++) {
-            int p, r, g, b, y, u, v;
-            if (c->srcFormat == AV_PIX_FMT_PAL8) {
-                p = ((const uint32_t *)(srcSlice[1]))[i];
-                r = (p >> 16) & 0xFF;
-                g = (p >>  8) & 0xFF;
-                b =  p        & 0xFF;
-            } else if (c->srcFormat == AV_PIX_FMT_RGB8) {
-                r = ( i >> 5     ) * 36;
-                g = ((i >> 2) & 7) * 36;
-                b = ( i       & 3) * 85;
-            } else if (c->srcFormat == AV_PIX_FMT_BGR8) {
-                b = ( i >> 6     ) * 85;
-                g = ((i >> 3) & 7) * 36;
-                r = ( i       & 7) * 36;
-            } else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
-                r = ( i >> 3     ) * 255;
-                g = ((i >> 1) & 3) * 85;
-                b = ( i       & 1) * 255;
-            } else if (c->srcFormat == AV_PIX_FMT_GRAY8 ||
-                      c->srcFormat == AV_PIX_FMT_Y400A) {
-                r = g = b = i;
-            } else {
-                assert(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
-                b = ( i >> 3     ) * 255;
-                g = ((i >> 1) & 3) * 85;
-                r = ( i       & 1) * 255;
-            }
-            y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-            u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-            v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-            c->pal_yuv[i] = y + (u << 8) + (v << 16);
-
-            switch (c->dstFormat) {
-            case AV_PIX_FMT_BGR32:
-#if !HAVE_BIGENDIAN
-            case AV_PIX_FMT_RGB24:
-#endif
-                c->pal_rgb[i] =  r + (g << 8) + (b << 16);
-                break;
-            case AV_PIX_FMT_BGR32_1:
-#if HAVE_BIGENDIAN
-            case AV_PIX_FMT_BGR24:
-#endif
-                c->pal_rgb[i] = (r + (g << 8) + (b << 16)) << 8;
-                break;
-            case AV_PIX_FMT_RGB32_1:
-#if HAVE_BIGENDIAN
-            case AV_PIX_FMT_RGB24:
-#endif
-                c->pal_rgb[i] = (b + (g << 8) + (r << 16)) << 8;
-                break;
-            case AV_PIX_FMT_RGB32:
-#if !HAVE_BIGENDIAN
-            case AV_PIX_FMT_BGR24:
-#endif
-            default:
-                c->pal_rgb[i] =  b + (g << 8) + (r << 16);
-            }
-        }
-    }
-
-    // copy strides, so they can safely be modified
-    if (c->sliceDir == 1) {
-        // slices go from top to bottom
-        int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2],
-                              srcStride[3] };
-        int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2],
-                              dstStride[3] };
-
-        reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
-
-        /* reset slice direction at end of frame */
-        if (srcSliceY + srcSliceH == c->srcH)
-            c->sliceDir = 0;
-
-        return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
-                          dstStride2);
-    } else {
-        // slices go from bottom to top => we flip the image internally
-        int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2],
-                              -srcStride[3] };
-        int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2],
-                              -dstStride[3] };
-
-        src2[0] += (srcSliceH - 1) * srcStride[0];
-        if (!usePal(c->srcFormat))
-            src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
-        src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
-        src2[3] += (srcSliceH - 1) * srcStride[3];
-        dst2[0] += ( c->dstH                         - 1) * dstStride[0];
-        dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
-        dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
-        dst2[3] += ( c->dstH                         - 1) * dstStride[3];
-
-        reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
-
-        /* reset slice direction at end of frame */
-        if (!srcSliceY)
-            c->sliceDir = 0;
-
-        return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
-                          srcSliceH, dst2, dstStride2);
-    }
-}
-
 /* Convert the palette to the same packed 32-bit format as the palette */
 void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst,
                                    int num_pixels, const uint8_t *palette)
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 9e5ebac72e..95d81d8f06 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1,27 +1,27 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
 #define _SVID_SOURCE // needed for MAP_ANONYMOUS
-#include <assert.h>
+#define _DARWIN_C_SOURCE // needed for MAP_ANON
 #include <inttypes.h>
 #include <math.h>
 #include <stdio.h>
@@ -38,6 +38,7 @@
 #endif
 
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
@@ -53,18 +54,19 @@
 
 unsigned swscale_version(void)
 {
+    av_assert0(LIBSWSCALE_VERSION_MICRO >= 100);
     return LIBSWSCALE_VERSION_INT;
 }
 
 const char *swscale_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION;
 }
 
 const char *swscale_license(void)
 {
 #define LICENSE_PREFIX "libswscale license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
 #define RET 0xC3 // near return opcode for x86
@@ -103,6 +105,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
     [AV_PIX_FMT_RGBA]        = { 1, 1 },
     [AV_PIX_FMT_ABGR]        = { 1, 1 },
     [AV_PIX_FMT_BGRA]        = { 1, 1 },
+    [AV_PIX_FMT_0RGB]        = { 1, 1 },
+    [AV_PIX_FMT_RGB0]        = { 1, 1 },
+    [AV_PIX_FMT_0BGR]        = { 1, 1 },
+    [AV_PIX_FMT_BGR0]        = { 1, 1 },
     [AV_PIX_FMT_GRAY16BE]    = { 1, 1 },
     [AV_PIX_FMT_GRAY16LE]    = { 1, 1 },
     [AV_PIX_FMT_YUV440P]     = { 1, 1 },
@@ -130,6 +136,8 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
     [AV_PIX_FMT_YUVA444P16LE]= { 1, 1 },
     [AV_PIX_FMT_RGB48BE]     = { 1, 1 },
     [AV_PIX_FMT_RGB48LE]     = { 1, 1 },
+    [AV_PIX_FMT_RGBA64BE]    = { 1, 0 },
+    [AV_PIX_FMT_RGBA64LE]    = { 1, 0 },
     [AV_PIX_FMT_RGB565BE]    = { 1, 1 },
     [AV_PIX_FMT_RGB565LE]    = { 1, 1 },
     [AV_PIX_FMT_RGB555BE]    = { 1, 1 },
@@ -151,23 +159,41 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
     [AV_PIX_FMT_Y400A]       = { 1, 0 },
     [AV_PIX_FMT_BGR48BE]     = { 1, 1 },
     [AV_PIX_FMT_BGR48LE]     = { 1, 1 },
+    [AV_PIX_FMT_BGRA64BE]    = { 0, 0 },
+    [AV_PIX_FMT_BGRA64LE]    = { 0, 0 },
     [AV_PIX_FMT_YUV420P9BE]  = { 1, 1 },
     [AV_PIX_FMT_YUV420P9LE]  = { 1, 1 },
     [AV_PIX_FMT_YUV420P10BE] = { 1, 1 },
     [AV_PIX_FMT_YUV420P10LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV420P12BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV420P12LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV420P14BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV420P14LE] = { 1, 1 },
     [AV_PIX_FMT_YUV422P9BE]  = { 1, 1 },
     [AV_PIX_FMT_YUV422P9LE]  = { 1, 1 },
     [AV_PIX_FMT_YUV422P10BE] = { 1, 1 },
     [AV_PIX_FMT_YUV422P10LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV422P12BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV422P12LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV422P14BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV422P14LE] = { 1, 1 },
     [AV_PIX_FMT_YUV444P9BE]  = { 1, 1 },
     [AV_PIX_FMT_YUV444P9LE]  = { 1, 1 },
     [AV_PIX_FMT_YUV444P10BE] = { 1, 1 },
     [AV_PIX_FMT_YUV444P10LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV444P12BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV444P12LE] = { 1, 1 },
+    [AV_PIX_FMT_YUV444P14BE] = { 1, 1 },
+    [AV_PIX_FMT_YUV444P14LE] = { 1, 1 },
     [AV_PIX_FMT_GBRP]        = { 1, 0 },
     [AV_PIX_FMT_GBRP9LE]     = { 1, 0 },
     [AV_PIX_FMT_GBRP9BE]     = { 1, 0 },
     [AV_PIX_FMT_GBRP10LE]    = { 1, 0 },
     [AV_PIX_FMT_GBRP10BE]    = { 1, 0 },
+    [AV_PIX_FMT_GBRP12LE]    = { 1, 0 },
+    [AV_PIX_FMT_GBRP12BE]    = { 1, 0 },
+    [AV_PIX_FMT_GBRP14LE]    = { 1, 0 },
+    [AV_PIX_FMT_GBRP14BE]    = { 1, 0 },
     [AV_PIX_FMT_GBRP16LE]    = { 1, 0 },
     [AV_PIX_FMT_GBRP16BE]    = { 1, 0 },
 };
@@ -186,6 +212,7 @@ int sws_isSupportedOutput(enum AVPixelFormat pix_fmt)
 
 extern const int32_t ff_yuv2rgb_coeffs[8][4];
 
+#if FF_API_SWS_FORMAT_NAME
 const char *sws_format_name(enum AVPixelFormat format)
 {
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(format);
@@ -194,6 +221,7 @@ const char *sws_format_name(enum AVPixelFormat format)
     else
         return "Unknown format";
 }
+#endif
 
 static double getSplineCoeff(double a, double b, double c, double d,
                              double dist)
@@ -212,7 +240,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
                       int *outFilterSize, int xInc, int srcW, int dstW,
                       int filterAlign, int one, int flags, int cpu_flags,
                       SwsVector *srcFilter, SwsVector *dstFilter,
-                      double param[2], int is_horizontal)
+                      double param[2])
 {
     int i;
     int filterSize;
@@ -220,7 +248,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     int minFilterSize;
     int64_t *filter    = NULL;
     int64_t *filter2   = NULL;
-    const int64_t fone = 1LL << 54;
+    const int64_t fone = 1LL << (54 - FFMIN(av_log2(srcW/dstW), 8));
     int ret            = -1;
 
     emms_c(); // FIXME should not be required but IS (even for non-MMX versions)
@@ -240,7 +268,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
         }
     } else if (flags & SWS_POINT) { // lame looking point sampling mode
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize = 1;
         FF_ALLOC_OR_GOTO(NULL, filter,
                          dstW * sizeof(*filter) * filterSize, fail);
@@ -256,7 +284,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     } else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
                (flags & SWS_FAST_BILINEAR)) { // bilinear upscale
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize = 2;
         FF_ALLOC_OR_GOTO(NULL, filter,
                          dstW * sizeof(*filter) * filterSize, fail);
@@ -269,8 +297,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
             (*filterPos)[i] = xx;
             // bilinear upscale / linear interpolate / area averaging
             for (j = 0; j < filterSize; j++) {
-                int64_t coeff = fone - FFABS((xx << 16) - xDstInSrc) *
-                                (fone >> 16);
+                int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
                 if (coeff < 0)
                     coeff = 0;
                 filter[i * filterSize + j] = coeff;
@@ -299,8 +326,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
         else if (flags & SWS_BILINEAR)
             sizeFactor = 2;
         else {
-            sizeFactor = 0;     // GCC warning killer
-            assert(0);
+            av_assert0(0);
         }
 
         if (xInc <= 1 << 16)
@@ -348,7 +374,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
                                     (-12 * B - 48 * C) * d   +
                                       (8 * B + 24 * C) * (1 << 30);
                     }
-                    coeff *= fone >> (30 + 24);
+                    coeff /= (1LL<<54)/fone;
                 }
 #if 0
                 else if (flags & SWS_X) {
@@ -399,8 +425,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
                     double p = -2.196152422706632;
                     coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
                 } else {
-                    coeff = 0.0; // GCC warning killer
-                    assert(0);
+                    av_assert0(0);
                 }
 
                 filter[i * filterSize + j] = coeff;
@@ -413,13 +438,13 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     /* apply src & dst Filter to filter -> filter2
      * av_free(filter);
      */
-    assert(filterSize > 0);
+    av_assert0(filterSize > 0);
     filter2Size = filterSize;
     if (srcFilter)
         filter2Size += srcFilter->length - 1;
     if (dstFilter)
         filter2Size += dstFilter->length - 1;
-    assert(filter2Size > 0);
+    av_assert0(filter2Size > 0);
     FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size * dstW * sizeof(*filter2), fail);
 
     for (i = 0; i < dstW; i++) {
@@ -501,13 +526,15 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
             filterAlign = 1;
     }
 
-    assert(minFilterSize > 0);
+    av_assert0(minFilterSize > 0);
     filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
-    assert(filterSize > 0);
+    av_assert0(filterSize > 0);
     filter = av_malloc(filterSize * dstW * sizeof(*filter));
     if (filterSize >= MAX_FILTER_SIZE * 16 /
-                      ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
+                      ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter) {
+        av_log(NULL, AV_LOG_ERROR, "sws: filterSize %d is too large, try less extreem scaling or increase MAX_FILTER_SIZE and recompile\n", filterSize);
         goto fail;
+    }
     *outFilterSize = filterSize;
 
     if (flags & SWS_PRINT_INFO)
@@ -531,29 +558,27 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     // FIXME try to align filterPos if possible
 
     // fix borders
-    if (is_horizontal) {
-        for (i = 0; i < dstW; i++) {
-            int j;
-            if ((*filterPos)[i] < 0) {
-                // move filter coefficients left to compensate for filterPos
-                for (j = 1; j < filterSize; j++) {
-                    int left = FFMAX(j + (*filterPos)[i], 0);
-                    filter[i * filterSize + left] += filter[i * filterSize + j];
-                    filter[i * filterSize + j]     = 0;
-                }
-                (*filterPos)[i] = 0;
+    for (i = 0; i < dstW; i++) {
+        int j;
+        if ((*filterPos)[i] < 0) {
+            // move filter coefficients left to compensate for filterPos
+            for (j = 1; j < filterSize; j++) {
+                int left = FFMAX(j + (*filterPos)[i], 0);
+                filter[i * filterSize + left] += filter[i * filterSize + j];
+                filter[i * filterSize + j]     = 0;
             }
+            (*filterPos)[i]= 0;
+        }
 
-            if ((*filterPos)[i] + filterSize > srcW) {
-                int shift = (*filterPos)[i] + filterSize - srcW;
-                // move filter coefficients right to compensate for filterPos
-                for (j = filterSize - 2; j >= 0; j--) {
-                    int right = FFMIN(j + shift, filterSize - 1);
-                    filter[i * filterSize + right] += filter[i * filterSize + j];
-                    filter[i * filterSize + j]      = 0;
-                }
-                (*filterPos)[i] = srcW - filterSize;
+        if ((*filterPos)[i] + filterSize > srcW) {
+            int shift = (*filterPos)[i] + filterSize - srcW;
+            // move filter coefficients right to compensate for filterPos
+            for (j = filterSize - 2; j >= 0; j--) {
+                int right = FFMIN(j + shift, filterSize - 1);
+                filter[i * filterSize + right] += filter[i * filterSize + j];
+                filter[i * filterSize + j]      = 0;
             }
+            (*filterPos)[i]= srcW - filterSize;
         }
     }
 
@@ -594,6 +619,8 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     ret = 0;
 
 fail:
+    if(ret < 0)
+        av_log(NULL, AV_LOG_ERROR, "sws: initFilter failed\n");
     av_free(filter);
     av_free(filter2);
     return ret;
@@ -780,11 +807,17 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
     memcpy(c->srcColorspaceTable, inv_table, sizeof(int) * 4);
     memcpy(c->dstColorspaceTable, table, sizeof(int) * 4);
 
+    if(!isYUV(c->dstFormat) && !isGray(c->dstFormat))
+        dstRange = 0;
+    if(!isYUV(c->srcFormat) && !isGray(c->srcFormat))
+        srcRange = 0;
+
     c->brightness = brightness;
     c->contrast   = contrast;
     c->saturation = saturation;
     c->srcRange   = srcRange;
     c->dstRange   = dstRange;
+
     if (isYUV(c->dstFormat) || isGray(c->dstFormat))
         return -1;
 
@@ -805,7 +838,7 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
                              int *srcRange, int **table, int *dstRange,
                              int *brightness, int *contrast, int *saturation)
 {
-    if (isYUV(c->dstFormat) || isGray(c->dstFormat))
+    if (!c || isYUV(c->dstFormat) || isGray(c->dstFormat))
         return -1;
 
     *inv_table  = c->srcColorspaceTable;
@@ -839,6 +872,17 @@ static int handle_jpeg(enum AVPixelFormat *format)
     }
 }
 
+static int handle_0alpha(enum AVPixelFormat *format)
+{
+    switch (*format) {
+    case AV_PIX_FMT_0BGR    : *format = AV_PIX_FMT_ABGR   ; return 1;
+    case AV_PIX_FMT_BGR0    : *format = AV_PIX_FMT_BGRA   ; return 4;
+    case AV_PIX_FMT_0RGB    : *format = AV_PIX_FMT_ARGB   ; return 1;
+    case AV_PIX_FMT_RGB0    : *format = AV_PIX_FMT_RGBA   ; return 4;
+    default:                                          return 0;
+    }
+}
+
 SwsContext *sws_alloc_context(void)
 {
     SwsContext *c = av_mallocz(sizeof(SwsContext));
@@ -852,7 +896,7 @@ SwsContext *sws_alloc_context(void)
 av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                              SwsFilter *dstFilter)
 {
-    int i;
+    int i, j;
     int usesVFilter, usesHFilter;
     int unscaled;
     SwsFilter dummyFilter = { NULL, NULL, NULL, NULL };
@@ -860,8 +904,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     int srcH              = c->srcH;
     int dstW              = c->dstW;
     int dstH              = c->dstH;
-    int dst_stride        = FFALIGN(dstW * sizeof(int16_t) + 16, 16);
-    int dst_stride_px     = dst_stride >> 1;
+    int dst_stride        = FFALIGN(dstW * sizeof(int16_t) + 66, 16);
     int flags, cpu_flags;
     enum AVPixelFormat srcFormat = c->srcFormat;
     enum AVPixelFormat dstFormat = c->dstFormat;
@@ -876,14 +919,25 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 
     unscaled = (srcW == dstW && srcH == dstH);
 
+    handle_jpeg(&srcFormat);
+    handle_jpeg(&dstFormat);
+    handle_0alpha(&srcFormat);
+    handle_0alpha(&dstFormat);
+
+    if(srcFormat!=c->srcFormat || dstFormat!=c->dstFormat){
+        av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you did set range correctly\n");
+        c->srcFormat= srcFormat;
+        c->dstFormat= dstFormat;
+    }
+
     if (!sws_isSupportedInput(srcFormat)) {
         av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n",
-               sws_format_name(srcFormat));
+               av_get_pix_fmt_name(srcFormat));
         return AVERROR(EINVAL);
     }
     if (!sws_isSupportedOutput(dstFormat)) {
         av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n",
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
         return AVERROR(EINVAL);
     }
 
@@ -899,12 +953,11 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                  SWS_SPLINE        |
                  SWS_BICUBLIN);
     if (!i || (i & (i - 1))) {
-        av_log(c, AV_LOG_ERROR,
-               "Exactly one scaler algorithm must be chosen\n");
+        av_log(c, AV_LOG_ERROR, "Exactly one scaler algorithm must be chosen, got %X\n", i);
         return AVERROR(EINVAL);
     }
     /* sanity check */
-    if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1) {
+    if (srcW < 1 || srcH < 1 || dstW < 1 || dstH < 1) {
         /* FIXME check if these are enough and try to lower them after
          * fixing the relevant parts of the code */
         av_log(c, AV_LOG_ERROR, "%dx%d -> %dx%d is invalid scaling dimension\n",
@@ -935,6 +988,14 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
     getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
 
+
+    if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
+        if (dstW&1) {
+            av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
+            flags |= SWS_FULL_CHR_H_INT;
+            c->flags = flags;
+        }
+    }
     /* reuse chroma for 2 pixels RGB/BGR unless user wants full
      * chroma interpolation */
     if (flags & SWS_FULL_CHR_H_INT &&
@@ -945,9 +1006,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
         dstFormat != AV_PIX_FMT_ABGR  &&
         dstFormat != AV_PIX_FMT_RGB24 &&
         dstFormat != AV_PIX_FMT_BGR24) {
-        av_log(c, AV_LOG_ERROR,
+        av_log(c, AV_LOG_WARNING,
                "full chroma interpolation for destination format '%s' not yet implemented\n",
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
         flags   &= ~SWS_FULL_CHR_H_INT;
         c->flags = flags;
     }
@@ -975,6 +1036,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     c->chrDstW = -((-dstW) >> c->chrDstHSubSample);
     c->chrDstH = -((-dstH) >> c->chrDstVSubSample);
 
+    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
+
     /* unscaled special cases */
     if (unscaled && !usesHFilter && !usesVFilter &&
         (c->srcRange == c->dstRange || isAnyRGB(dstFormat))) {
@@ -984,7 +1047,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             if (flags & SWS_PRINT_INFO)
                 av_log(c, AV_LOG_INFO,
                        "using unscaled %s -> %s special converter\n",
-                       sws_format_name(srcFormat), sws_format_name(dstFormat));
+                       av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
             return 0;
         }
     }
@@ -995,12 +1058,11 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     c->dstBpc = 1 + desc_dst->comp[0].depth_minus1;
     if (c->dstBpc < 8)
         c->dstBpc = 8;
+    if (isAnyRGB(srcFormat) || srcFormat == AV_PIX_FMT_PAL8)
+        c->srcBpc = 16;
     if (c->dstBpc == 16)
         dst_stride <<= 1;
-    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
-                     (FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
-                     fail);
-    if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 10) {
+    if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 14) {
         c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
                             (srcW & 15) == 0) ? 1 : 0;
         if (!c->canMMX2BeUsed && dstW >= srcW && (srcW & 15) == 0
@@ -1009,8 +1071,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                 av_log(c, AV_LOG_INFO,
                        "output width is not a multiple of 32 -> no MMXEXT scaler\n");
         }
-        if (usesHFilter)
-            c->canMMX2BeUsed = 0;
+        if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat))
+            c->canMMX2BeUsed=0;
     } else
         c->canMMX2BeUsed = 0;
 
@@ -1030,7 +1092,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             c->chrXInc += 20;
         }
         // we don't use the x86 asm scaler if MMX is available
-        else if (INLINE_MMX(cpu_flags)) {
+        else if (INLINE_MMX(cpu_flags) && c->dstBpc <= 14) {
             c->lumXInc = ((int64_t)(srcW       - 2) << 16) / (dstW       - 2) - 20;
             c->chrXInc = ((int64_t)(c->chrSrcW - 2) << 16) / (c->chrDstW - 2) - 20;
         }
@@ -1059,17 +1121,25 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             c->chrMmx2FilterCode = av_malloc(c->chrMmx2FilterCodeSize);
 #endif
 
+#ifdef MAP_ANONYMOUS
+            if (c->lumMmx2FilterCode == MAP_FAILED || c->chrMmx2FilterCode == MAP_FAILED)
+#else
             if (!c->lumMmx2FilterCode || !c->chrMmx2FilterCode)
+#endif
+            {
+                av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n");
                 return AVERROR(ENOMEM);
+            }
+
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilter,    (dstW           / 8 + 8) * sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilter,    (c->chrDstW     / 4 + 8) * sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW       / 2 / 8 + 8) * sizeof(int32_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
 
-            initMMX2HScaler(dstW, c->lumXInc, c->lumMmx2FilterCode,
-                            c->hLumFilter, c->hLumFilterPos, 8);
+            initMMX2HScaler(      dstW, c->lumXInc, c->lumMmx2FilterCode,
+                            c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
             initMMX2HScaler(c->chrDstW, c->chrXInc, c->chrMmx2FilterCode,
-                            c->hChrFilter, c->hChrFilterPos, 4);
+                            c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
 
 #if USE_MMAP
             mprotect(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
@@ -1088,14 +1158,14 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                            srcW, dstW, filterAlign, 1 << 14,
                            (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
                            cpu_flags, srcFilter->lumH, dstFilter->lumH,
-                           c->param, 1) < 0)
+                           c->param) < 0)
                 goto fail;
             if (initFilter(&c->hChrFilter, &c->hChrFilterPos,
                            &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
                            (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
                            cpu_flags, srcFilter->chrH, dstFilter->chrH,
-                           c->param, 1) < 0)
+                           c->param) < 0)
                 goto fail;
         }
     } // initialize horizontal stuff
@@ -1111,14 +1181,14 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                        c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
                        (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
                        cpu_flags, srcFilter->lumV, dstFilter->lumV,
-                       c->param, 0) < 0)
+                       c->param) < 0)
             goto fail;
         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
                        c->chrYInc, c->chrSrcH, c->chrDstH,
                        filterAlign, (1 << 12),
                        (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
                        cpu_flags, srcFilter->chrV, dstFilter->chrV,
-                       c->param, 0) < 0)
+                       c->param) < 0)
             goto fail;
 
 #if HAVE_ALTIVEC
@@ -1175,9 +1245,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
                           dst_stride + 16, fail);
         c->lumPixBuf[i] = c->lumPixBuf[i + c->vLumBufSize];
     }
-    // 64 / (c->dstBpc & ~7) is the same as 16 / sizeof(scaling_intermediate)
-    c->uv_off_px   = dst_stride_px + 64 / (c->dstBpc & ~7);
-    c->uv_off_byte = dst_stride + 16;
+    // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
+    c->uv_off   = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
+    c->uv_offx2 = dst_stride + 16;
     for (i = 0; i < c->vChrBufSize; i++) {
         FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i + c->vChrBufSize],
                          dst_stride * 2 + 32, fail);
@@ -1194,9 +1264,15 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 
     // try to avoid drawing green stuff between the right end and the stride end
     for (i = 0; i < c->vChrBufSize; i++)
-        memset(c->chrUPixBuf[i], 64, dst_stride * 2 + 1);
+        if(desc_dst->comp[0].depth_minus1 == 15){
+            av_assert0(c->dstBpc > 14);
+            for(j=0; j<dst_stride/2+1; j++)
+                ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
+        } else
+            for(j=0; j<dst_stride+1; j++)
+                ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
 
-    assert(c->chrDstH <= dstH);
+    av_assert0(c->chrDstH <= dstH);
 
     if (flags & SWS_PRINT_INFO) {
         if (flags & SWS_FAST_BILINEAR)
@@ -1225,7 +1301,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
 
         av_log(c, AV_LOG_INFO, "from %s to %s%s ",
-               sws_format_name(srcFormat),
+               av_get_pix_fmt_name(srcFormat),
 #ifdef DITHER1XBPP
                dstFormat == AV_PIX_FMT_BGR555   || dstFormat == AV_PIX_FMT_BGR565   ||
                dstFormat == AV_PIX_FMT_RGB444BE || dstFormat == AV_PIX_FMT_RGB444LE ||
@@ -1234,7 +1310,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 #else
                "",
 #endif
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
 
         if (INLINE_MMXEXT(cpu_flags))
             av_log(c, AV_LOG_INFO, "using MMXEXT\n");
@@ -1281,6 +1357,8 @@ SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
     c->dstH      = dstH;
     c->srcRange  = handle_jpeg(&srcFormat);
     c->dstRange  = handle_jpeg(&dstFormat);
+    c->src0Alpha = handle_0alpha(&srcFormat);
+    c->dst0Alpha = handle_0alpha(&dstFormat);
     c->srcFormat = srcFormat;
     c->dstFormat = dstFormat;
 
@@ -1365,7 +1443,12 @@ SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
 
 SwsVector *sws_allocVec(int length)
 {
-    SwsVector *vec = av_malloc(sizeof(SwsVector));
+    SwsVector *vec;
+
+    if(length <= 0 || length > INT_MAX/ sizeof(double))
+        return NULL;
+
+    vec = av_malloc(sizeof(SwsVector));
     if (!vec)
         return NULL;
     vec->length = length;
@@ -1380,7 +1463,12 @@ SwsVector *sws_getGaussianVec(double variance, double quality)
     const int length = (int)(variance * quality + 0.5) | 1;
     int i;
     double middle  = (length - 1) * 0.5;
-    SwsVector *vec = sws_allocVec(length);
+    SwsVector *vec;
+
+    if(variance < 0 || quality < 0)
+        return NULL;
+
+    vec = sws_allocVec(length);
 
     if (!vec)
         return NULL;
@@ -1669,7 +1757,7 @@ void sws_freeContext(SwsContext *c)
 #endif /* HAVE_MMX_INLINE */
 
     av_freep(&c->yuvTable);
-    av_free(c->formatConvBuffer);
+    av_freep(&c->formatConvBuffer);
 
     av_free(c);
 }
@@ -1708,10 +1796,12 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
         context->srcW      = srcW;
         context->srcH      = srcH;
         context->srcRange  = handle_jpeg(&srcFormat);
+        context->src0Alpha = handle_0alpha(&srcFormat);
         context->srcFormat = srcFormat;
         context->dstW      = dstW;
         context->dstH      = dstH;
         context->dstRange  = handle_jpeg(&dstFormat);
+        context->dst0Alpha = handle_0alpha(&dstFormat);
         context->dstFormat = dstFormat;
         context->flags     = flags;
         context->param[0]  = param[0];
diff --git a/libswscale/version.h b/libswscale/version.h
index acbdf6b012..37dcc96572 100644
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 
 #define LIBSWSCALE_VERSION_MAJOR 2
 #define LIBSWSCALE_VERSION_MINOR 1
-#define LIBSWSCALE_VERSION_MICRO 1
+#define LIBSWSCALE_VERSION_MICRO 101
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                                LIBSWSCALE_VERSION_MINOR, \
@@ -52,5 +52,8 @@
 #ifndef FF_API_SWS_CPU_CAPS
 #define FF_API_SWS_CPU_CAPS    (LIBSWSCALE_VERSION_MAJOR < 3)
 #endif
+#ifndef FF_API_SWS_FORMAT_NAME
+#define FF_API_SWS_FORMAT_NAME  (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
 
 #endif /* SWSCALE_VERSION_H */
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 5416d48a4c..7d219b458b 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,3 +1,5 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
 OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
 MMX-OBJS                        += x86/rgb2rgb.o                        \
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index a8d5a5a187..327b9f0bc5 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -35,8 +35,8 @@ SECTION_RODATA
 %define GV 0xD0E3
 %define BV 0xF6E4
 
-rgb_Yrnd:        times 4 dd 0x84000        ;  16.5 << 15
-rgb_UVrnd:       times 4 dd 0x404000       ; 128.5 << 15
+rgb_Yrnd:        times 4 dd 0x80100        ;  16.5 << 15
+rgb_UVrnd:       times 4 dd 0x400100       ; 128.5 << 15
 bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
 bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
 rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
@@ -82,7 +82,7 @@ SECTION .text
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
 %if mmsize == 8
     mova           m5, [%2_Ycoeff_12x4]
     mova           m6, [%2_Ycoeff_3x56]
@@ -114,6 +114,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
+    add            wq, wq
     add          dstq, wq
     neg            wq
 %if notcpuflag(ssse3)
@@ -157,12 +158,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
     paddd          m2, m3                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
     paddd          m0, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
     paddd          m2, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -171,7 +171,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [%2_Ucoeff_12x4]
     mova           m9, [%2_Ucoeff_3x56]
@@ -202,10 +202,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
 %endif ; x86-32/64
 %endif ; cpuflag(ssse3)
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
     neg            wq
@@ -263,23 +264,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
     paddd          m2, m6                 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
     paddd          m1, m6                 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
     paddd          m4, m6                 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
-    psrad          m1, 15
-    psrad          m4, 15
+    psrad          m0, 9
+    psrad          m2, 9
+    psrad          m1, 9
+    psrad          m4, 9
     packssdw       m0, m1                 ; (word) { U[0-7] }
     packssdw       m2, m4                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-3] }
-    packuswb       m2, m2                 ; (byte) { V[0-3] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -305,13 +303,15 @@ RGB24_FUNCS 10, 12
 INIT_XMM ssse3
 RGB24_FUNCS 11, 13
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 RGB24_FUNCS 11, 13
+%endif
 
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
     mova           m5, [rgba_Ycoeff_%2%4]
     mova           m6, [rgba_Ycoeff_%3%5]
 %if %0 == 6
@@ -322,6 +322,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     movsxd         wq, wd
 %endif
     lea          srcq, [srcq+wq*4]
+    add            wq, wq
     add          dstq, wq
     neg            wq
     mova           m4, [rgb_Yrnd]
@@ -329,8 +330,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
     pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
@@ -340,12 +341,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     paddd          m2, m4                 ; += rgb_Yrnd
     paddd          m0, m1                 ; (dword) { Y[0-3] }
     paddd          m2, m3                 ; (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; %0 == 3
@@ -354,7 +354,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [rgba_Ucoeff_%2%4]
     mova           m9, [rgba_Ucoeff_%3%5]
@@ -375,21 +375,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
 %else ; ARCH_X86_64 && %0 == 6
 .body:
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
-    lea          srcq, [srcq+wq*4]
+    lea          srcq, [srcq+wq*2]
     neg            wq
     pcmpeqb        m7, m7
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
     mova           m6, [rgb_UVrnd]
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
     pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
@@ -405,25 +406,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
     pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
     paddd          m3, m6                 ; += rgb_UVrnd
     paddd          m5, m6                 ; += rgb_UVrnd
-    psrad          m0, 15
+    psrad          m0, 9
     paddd          m1, m3                 ; (dword) { V[4-7] }
     paddd          m4, m5                 ; (dword) { U[4-7] }
-    psrad          m2, 15
-    psrad          m4, 15
-    psrad          m1, 15
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
     packssdw       m0, m4                 ; (word) { U[0-7] }
     packssdw       m2, m1                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-7] }
-    packuswb       m2, m2                 ; (byte) { V[0-7] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -451,8 +449,10 @@ RGB32_FUNCS 0, 0
 INIT_XMM sse2
 RGB32_FUNCS 8, 12
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 RGB32_FUNCS 8, 12
+%endif
 
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -489,7 +489,7 @@ RGB32_FUNCS 8, 12
 ;      will be the same (i.e. YUYV+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
@@ -559,11 +559,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
 ;      will be the same (i.e. UYVY+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -593,8 +593,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 .loop_%1:
     mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
     mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
-    pand           m2, m0, m4             ; (word) { U0, U1, ..., U7 }
-    pand           m3, m1, m4             ; (word) { U8, U9, ..., U15 }
+    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
+    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
     psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
     psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
     packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
@@ -614,11 +614,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 ; %1 = nr. of XMM registers
 ; %2 = nv12 or nv21
 %macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -626,8 +626,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
     test         srcq, 15
 %endif
     lea          srcq, [srcq+wq*2]
-    pcmpeqb        m4, m4                 ; (byte) { 0xff } x 16
-    psrlw          m4, 8                  ; (word) { 0x00ff } x 8
+    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
+    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
 %if mmsize == 16
     jnz .loop_u_start
     neg            wq
@@ -659,6 +659,7 @@ YUYV_TO_UV_FN 3, uyvy
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
 ; that's not faster in practice
@@ -666,3 +667,4 @@ YUYV_TO_UV_FN 3, yuyv
 YUYV_TO_UV_FN 3, uyvy, 1
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index cf0dec3843..ec253ba5eb 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -266,10 +266,12 @@ yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
 yuv2planeX_fn 16,  8, 5
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 yuv2planeX_fn  8, 10, 7
 yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
+%endif
 
 ; %1=outout-bpc, %2=alignment (u/a)
 %macro yuv2plane1_mainloop 2
@@ -404,8 +406,10 @@ yuv2plane1_fn 16, 6, 3
 INIT_XMM sse4
 yuv2plane1_fn 16, 5, 3
 
+%if HAVE_AVX_EXTERNAL
 INIT_XMM avx
 yuv2plane1_fn  8, 5, 5
 yuv2plane1_fn  9, 5, 3
 yuv2plane1_fn 10, 5, 3
 yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 486f436702..f7288d3e44 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -72,6 +72,9 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
 DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
 
 #define RGB2YUV_SHIFT 8
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index 205b749244..d802ab4227 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -127,14 +127,11 @@ static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int sr
             "movq       %%mm4, %%mm3    \n\t" \
             "psllq        $48, %%mm2    \n\t" \
             "psllq        $32, %%mm3    \n\t" \
-            "pand "MANGLE(mask24hh)", %%mm2\n\t" \
-            "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
             "por        %%mm2, %%mm0    \n\t" \
             "psrlq        $16, %%mm1    \n\t" \
             "psrlq        $32, %%mm4    \n\t" \
             "psllq        $16, %%mm5    \n\t" \
             "por        %%mm3, %%mm1    \n\t" \
-            "pand  "MANGLE(mask24hhhh)", %%mm5\n\t" \
             "por        %%mm5, %%mm4    \n\t" \
  \
             MOVNTQ"     %%mm0,   (%0)    \n\t" \
@@ -713,27 +710,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
     }
 }
 
-/*
-  I use less accurate approximation here by simply left-shifting the input
-  value and filling the low order bits with zeroes. This method improves PNG
-  compression but this scheme cannot reproduce white exactly, since it does
-  not generate an all-ones maximum value; the net effect is to darken the
-  image slightly.
-
-  The better method should be "left bit replication":
-
-   4 3 2 1 0
-   ---------
-   1 1 0 1 1
-
-   7 6 5 4 3  2 1 0
-   ----------------
-   1 1 0 1 1  1 1 0
-   |=======|  |===|
-       |      leftmost bits repeated to fill open bits
-       |
-   original bits
-*/
 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
 {
     const uint16_t *end;
@@ -752,9 +728,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -782,9 +759,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -830,9 +808,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -854,9 +832,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -884,9 +864,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -931,9 +913,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -976,11 +958,12 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             PACK_RGB32
-            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
             :"memory");
         d += 16;
         s += 4;
@@ -990,9 +973,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
     }
 }
@@ -1017,11 +1000,13 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             PACK_RGB32
-            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
             :"memory");
         d += 16;
         s += 4;
@@ -1031,9 +1016,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
     }
 }
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index 440a27b0ba..c6dafdebc3 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -407,11 +407,15 @@ SCALE_FUNC %1, %2, X, X8, 7, %3
 SCALE_FUNCS  8, 15, %1
 SCALE_FUNCS  9, 15, %2
 SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 12, 15, %2
+SCALE_FUNCS 14, 15, %2
 SCALE_FUNCS 16, 15, %3
 %endif ; !sse4
 SCALE_FUNCS  8, 19, %1
 SCALE_FUNCS  9, 19, %2
 SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 12, 19, %2
+SCALE_FUNCS 14, 19, %2
 SCALE_FUNCS 16, 19, %3
 %endmacro
 
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fc74d97201..e82666bec9 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,6 +23,7 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
@@ -70,6 +71,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 
+
 //MMX versions
 #if HAVE_MMX_INLINE
 #undef RENAME
@@ -117,9 +119,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
         c->greenDither= ff_dither4[dstY&1];
     c->redDither= ff_dither8[(dstY+1)&1];
     if (dstY < dstH - 2) {
-        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-        const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+        const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+        const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
         int i;
 
         if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
@@ -186,7 +188,7 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
                 *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
                 lumMmxFilter[4*i+2]=
                 lumMmxFilter[4*i+3]=
-                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
+                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                     *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                     alpMmxFilter[4*i+2]=
@@ -197,12 +199,73 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
                 *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
                 chrMmxFilter[4*i+2]=
                 chrMmxFilter[4*i+3]=
-                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
+                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
             }
         }
     }
 }
 
+#if HAVE_MMXEXT
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    if(((int)dest) & 15){
+        return yuv2yuvX_MMX2(filter, filterSize, src, dest, dstW, dither, offset);
+    }
+    if (offset) {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         "movdqa    %%xmm3, %%xmm4\n\t"
+                         "psrlq       $24, %%xmm3\n\t"
+                         "psllq       $40, %%xmm4\n\t"
+                         "por       %%xmm4, %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    } else {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    }
+    __asm__ volatile(
+        "pxor      %%xmm0, %%xmm0\n\t"
+        "punpcklbw %%xmm0, %%xmm3\n\t"
+        "psraw        $4, %%xmm3\n\t"
+        "movdqa    %%xmm3, %%xmm4\n\t"
+        "movdqa    %%xmm3, %%xmm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movddup                  8(%%"REG_d"), %%xmm0      \n\t" /* filterCoeff */\
+        "movdqa              (%%"REG_S", %%"REG_c", 2), %%xmm2      \n\t" /* srcData */\
+        "movdqa            16(%%"REG_S", %%"REG_c", 2), %%xmm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
+        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
+        "paddw                            %%xmm2, %%xmm3      \n\t"\
+        "paddw                            %%xmm5, %%xmm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%xmm3      \n\t"\
+        "psraw                               $3, %%xmm4      \n\t"\
+        "packuswb                         %%xmm4, %%xmm3      \n\t"
+        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t"
+        "add                         $16, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movdqa    %%xmm7, %%xmm3\n\t"
+        "movdqa    %%xmm7, %%xmm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+#endif
+
 #endif /* HAVE_INLINE_ASM */
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
@@ -216,10 +279,14 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
     SCALE_FUNC(filter_n,  8, 15, opt); \
     SCALE_FUNC(filter_n,  9, 15, opt); \
     SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 12, 15, opt); \
+    SCALE_FUNC(filter_n, 14, 15, opt); \
     SCALE_FUNC(filter_n, 16, 15, opt); \
     SCALE_FUNC(filter_n,  8, 19, opt); \
     SCALE_FUNC(filter_n,  9, 19, opt); \
     SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 12, 19, opt); \
+    SCALE_FUNC(filter_n, 14, 19, opt); \
     SCALE_FUNC(filter_n, 16, 19, opt)
 
 #define SCALE_FUNCS_MMX(opt) \
@@ -275,11 +342,14 @@ VSCALE_FUNCS(avx, avx);
 
 #define INPUT_Y_FUNC(fmt, opt) \
 extern void ff_ ## fmt ## ToY_  ## opt(uint8_t *dst, const uint8_t *src, \
+                                       const uint8_t *unused1, const uint8_t *unused2, \
                                        int w, uint32_t *unused)
 #define INPUT_UV_FUNC(fmt, opt) \
 extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
-                                       const uint8_t *src, const uint8_t *unused1, \
-                                       int w, uint32_t *unused2)
+                                       const uint8_t *unused0, \
+                                       const uint8_t *src1, \
+                                       const uint8_t *src2, \
+                                       int w, uint32_t *unused)
 #define INPUT_FUNC(fmt, opt) \
     INPUT_Y_FUNC(fmt, opt); \
     INPUT_UV_FUNC(fmt, opt)
@@ -312,21 +382,32 @@ av_cold void ff_sws_init_swScale_mmx(SwsContext *c)
 #if HAVE_MMXEXT_INLINE
     if (cpu_flags & AV_CPU_FLAG_MMXEXT)
         sws_init_swScale_MMX2(c);
+    if (cpu_flags & AV_CPU_FLAG_SSE3){
+        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+            c->yuv2planeX = yuv2yuvX_sse3;
+    }
 #endif
 #endif /* HAVE_INLINE_ASM */
 
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
     if (c->srcBpc == 8) { \
-        hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale8to19_ ## filtersize ## _ ## opt1; \
     } else if (c->srcBpc == 9) { \
-        hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale9to19_ ## filtersize ## _ ## opt1; \
     } else if (c->srcBpc == 10) { \
-        hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale10to19_ ## filtersize ## _ ## opt1; \
-    } else /* c->srcBpc == 16 */ { \
-        hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+    } else if (c->srcBpc == 12) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale12to19_ ## filtersize ## _ ## opt1; \
+    } else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
+        hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+    } else { /* c->srcBpc == 16 */ \
+        av_assert0(c->srcBpc == 16);\
+        hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale16to19_ ## filtersize ## _ ## opt1; \
     } \
 } while (0)
@@ -341,14 +422,15 @@ switch(c->dstBpc){ \
     case 16:                          do_16_case;                          break; \
     case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
     case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
-    default: if (condition_8bit)      vscalefn = ff_yuv2planeX_8_  ## opt; break; \
+    default: if (condition_8bit)    /*vscalefn = ff_yuv2planeX_8_  ## opt;*/ break; \
     }
 #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
     switch(c->dstBpc){ \
     case 16: if (!isBE(c->dstFormat))            vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
     case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
     case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_  ## opt2;  break; \
-    default:                                     vscalefn = ff_yuv2plane1_8_  ## opt1;  break; \
+    case 8:                                      vscalefn = ff_yuv2plane1_8_  ## opt1;  break; \
+    default: av_assert0(c->dstBpc>8); \
     }
 #define case_rgb(x, X, opt) \
         case AV_PIX_FMT_ ## X: \
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 8be6398f59..c61a63f53a 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,25 +1,26 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #undef REAL_MOVNTQ
 #undef MOVNTQ
+#undef MOVNTQ2
 #undef PREFETCH
 
 #if COMPILE_TEMPLATE_MMXEXT
@@ -30,11 +31,84 @@
 
 #if COMPILE_TEMPLATE_MMXEXT
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
 #else
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
+#if !COMPILE_TEMPLATE_MMXEXT
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t"
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    dither_8to16(dither, offset);
+    __asm__ volatile(\
+        "psraw        $4, %%mm3\n\t"
+        "psraw        $4, %%mm4\n\t"
+        "movq    %%mm3, %%mm6\n\t"
+        "movq    %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
+        "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
+        "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%mm0, %%mm2      \n\t"\
+        "pmulhw                           %%mm0, %%mm5      \n\t"\
+        "paddw                            %%mm2, %%mm3      \n\t"\
+        "paddw                            %%mm5, %%mm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%mm3      \n\t"\
+        "psraw                               $3, %%mm4      \n\t"\
+        "packuswb                         %%mm4, %%mm3      \n\t"
+        MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
+        "add                          $8, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movq    %%mm6, %%mm3\n\t"
+        "movq    %%mm7, %%mm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
@@ -260,7 +334,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX_ACCURATE
@@ -293,7 +367,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX
@@ -350,7 +424,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -374,7 +448,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -427,7 +501,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -451,7 +525,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -584,7 +658,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -608,7 +682,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -649,7 +723,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -670,7 +744,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -786,8 +860,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
             : "%r8"
         );
 #else
-        *(const uint16_t **)(&c->u_temp)=abuf0;
-        *(const uint16_t **)(&c->v_temp)=abuf1;
+        c->u_temp=(intptr_t)abuf0;
+        c->v_temp=(intptr_t)abuf1;
         __asm__ volatile(
             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
             "mov        %4, %%"REG_b"               \n\t"
@@ -1559,9 +1633,9 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum AVPixelFormat dstFormat = c->dstFormat;
 
-    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
-        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21) {
-        if (!(c->flags & SWS_BITEXACT)) {
+    c->use_mmx_vfilter= 0;
+    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
+        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
             if (c->flags & SWS_ACCURATE_RND) {
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
@@ -1574,6 +1648,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             } else {
+                c->use_mmx_vfilter= 1;
+                c->yuv2planeX = RENAME(yuv2yuvX    );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
                     case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
@@ -1585,7 +1661,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             }
-        }
         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
             switch (c->dstFormat) {
             case AV_PIX_FMT_RGB32:
@@ -1614,7 +1689,7 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         }
     }
 
-    if (c->srcBpc == 8 && c->dstBpc <= 10) {
+    if (c->srcBpc == 8 && c->dstBpc <= 14) {
     // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
 #if COMPILE_TEMPLATE_MMXEXT
     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index 17ac3e2ffe..ea52e3c71b 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -7,20 +7,20 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -74,10 +74,6 @@ av_cold SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 #if HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
 
-    if (c->srcFormat != AV_PIX_FMT_YUV420P &&
-        c->srcFormat != AV_PIX_FMT_YUVA420P)
-        return NULL;
-
 #if HAVE_MMXEXT_INLINE
     if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
         switch (c->dstFormat) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index b028e93152..c879102cc4 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2001-2007 Michael Niedermayer
  *           (c) 2010 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,17 +43,14 @@
     if (h_size * depth > FFABS(dstStride[0]))                        \
         h_size -= 8;                                                 \
                                                                      \
-    if (c->srcFormat == AV_PIX_FMT_YUV422P) {                           \
-        srcStride[1] *= 2;                                           \
-        srcStride[2] *= 2;                                           \
-    }                                                                \
+    vshift = c->srcFormat != AV_PIX_FMT_YUV422P;                        \
                                                                      \
     __asm__ volatile ("pxor %mm4, %mm4\n\t");                        \
     for (y = 0; y < srcSliceH; y++) {                                \
         uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
         const uint8_t *py = src[0] +               y * srcStride[0]; \
-        const uint8_t *pu = src[1] +        (y >> 1) * srcStride[1]; \
-        const uint8_t *pv = src[2] +        (y >> 1) * srcStride[2]; \
+        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
+        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
 #define YUV2RGB_INITIAL_LOAD          \
@@ -141,6 +138,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index)                                      \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -148,6 +146,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index), "r" (pa - 2*index)                  \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 1dbd0d8e10..53b69d0f73 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -6,27 +6,26 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
-#include <assert.h>
 
 #include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
@@ -34,11 +33,14 @@
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
+#include "libavutil/pixdesc.h"
 
-extern const uint8_t dither_4x4_16[4][8];
-extern const uint8_t dither_8x8_32[8][8];
-extern const uint8_t dither_8x8_73[8][8];
-extern const uint8_t dither_8x8_220[8][8];
+extern const uint8_t dither_2x2_4[3][8];
+extern const uint8_t dither_2x2_8[3][8];
+extern const uint8_t dither_4x4_16[5][8];
+extern const uint8_t dither_8x8_32[9][8];
+extern const uint8_t dither_8x8_73[9][8];
+extern const uint8_t dither_8x8_220[9][8];
 
 const int32_t ff_yuv2rgb_coeffs[8][4] = {
     { 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
@@ -61,9 +63,9 @@ const int *sws_getCoefficients(int colorspace)
 #define LOADCHROMA(i)                               \
     U = pu[i];                                      \
     V = pv[i];                                      \
-    r = (void *)c->table_rV[V];                     \
-    g = (void *)(c->table_gU[U] + c->table_gV[V]);  \
-    b = (void *)c->table_bU[U];
+    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                     \
+    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);  \
+    b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB(dst, src, i)                         \
     Y              = src[2 * i];                    \
@@ -387,24 +389,65 @@ ENDYUV2RGBLINE(24, 1)
     PUTBGR24(dst_2, py_2, 0);
 ENDYUV2RGBFUNC()
 
-// This is exactly the same code as yuv2rgb_c_32 except for the types of
-// r, g, b, dst_1, dst_2
-YUV2RGBFUNC(yuv2rgb_c_16, uint16_t, 0)
+YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
+    const uint8_t *d16 = dither_2x2_8[y & 1];
+    const uint8_t *e16 = dither_2x2_4[y & 1];
+    const uint8_t *f16 = dither_2x2_8[(y & 1)^1];
+
+#define PUTRGB16(dst, src, i, o)                    \
+    Y              = src[2 * i];                    \
+    dst[2 * i]     = r[Y + d16[0 + o]] +            \
+                     g[Y + e16[0 + o]] +            \
+                     b[Y + f16[0 + o]];             \
+    Y              = src[2 * i + 1];                \
+    dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
+                     g[Y + e16[1 + o]] +            \
+                     b[Y + f16[1 + o]];
     LOADCHROMA(0);
-    PUTRGB(dst_1, py_1, 0);
-    PUTRGB(dst_2, py_2, 0);
+    PUTRGB16(dst_1, py_1, 0, 0);
+    PUTRGB16(dst_2, py_2, 0, 0 + 8);
 
     LOADCHROMA(1);
-    PUTRGB(dst_2, py_2, 1);
-    PUTRGB(dst_1, py_1, 1);
+    PUTRGB16(dst_2, py_2, 1, 2 + 8);
+    PUTRGB16(dst_1, py_1, 1, 2);
 
     LOADCHROMA(2);
-    PUTRGB(dst_1, py_1, 2);
-    PUTRGB(dst_2, py_2, 2);
+    PUTRGB16(dst_1, py_1, 2, 4);
+    PUTRGB16(dst_2, py_2, 2, 4 + 8);
 
     LOADCHROMA(3);
-    PUTRGB(dst_2, py_2, 3);
-    PUTRGB(dst_1, py_1, 3);
+    PUTRGB16(dst_2, py_2, 3, 6 + 8);
+    PUTRGB16(dst_1, py_1, 3, 6);
+CLOSEYUV2RGBFUNC(8)
+
+YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
+    const uint8_t *d16 = dither_2x2_8[y & 1];
+    const uint8_t *e16 = dither_2x2_8[(y & 1)^1];
+
+#define PUTRGB15(dst, src, i, o)                    \
+    Y              = src[2 * i];                    \
+    dst[2 * i]     = r[Y + d16[0 + o]] +            \
+                     g[Y + d16[1 + o]] +            \
+                     b[Y + e16[0 + o]];             \
+    Y              = src[2 * i + 1];                \
+    dst[2 * i + 1] = r[Y + d16[1 + o]] +            \
+                     g[Y + d16[0 + o]] +            \
+                     b[Y + e16[1 + o]];
+    LOADCHROMA(0);
+    PUTRGB15(dst_1, py_1, 0, 0);
+    PUTRGB15(dst_2, py_2, 0, 0 + 8);
+
+    LOADCHROMA(1);
+    PUTRGB15(dst_2, py_2, 1, 2 + 8);
+    PUTRGB15(dst_1, py_1, 1, 2);
+
+    LOADCHROMA(2);
+    PUTRGB15(dst_1, py_1, 2, 4);
+    PUTRGB15(dst_2, py_2, 2, 4 + 8);
+
+    LOADCHROMA(3);
+    PUTRGB15(dst_2, py_2, 3, 6 + 8);
+    PUTRGB15(dst_1, py_1, 3, 6);
 CLOSEYUV2RGBFUNC(8)
 
 // r, g, b, dst_1, dst_2
@@ -537,7 +580,7 @@ CLOSEYUV2RGBFUNC(8)
 YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
     const uint8_t *d128 = dither_8x8_220[y & 7];
     char out_1 = 0, out_2 = 0;
-    g = c->table_gU[128] + c->table_gV[128];
+    g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB1(out, src, i, o)                     \
     Y    = src[2 * i];                              \
@@ -579,7 +622,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 
     av_log(c, AV_LOG_WARNING,
            "No accelerated colorspace conversion found from %s to %s.\n",
-           sws_format_name(c->srcFormat), sws_format_name(c->dstFormat));
+           av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
     switch (c->dstFormat) {
     case AV_PIX_FMT_BGR48BE:
@@ -590,23 +633,21 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
         return yuv2rgb_c_48;
     case AV_PIX_FMT_ARGB:
     case AV_PIX_FMT_ABGR:
-        if (CONFIG_SWSCALE_ALPHA && c->srcFormat == AV_PIX_FMT_YUVA420P)
+        if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
             return yuva2argb_c;
     case AV_PIX_FMT_RGBA:
     case AV_PIX_FMT_BGRA:
-        if (CONFIG_SWSCALE_ALPHA && c->srcFormat == AV_PIX_FMT_YUVA420P)
-            return yuva2rgba_c;
-        else
-            return yuv2rgb_c_32;
+        return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
     case AV_PIX_FMT_RGB24:
         return yuv2rgb_c_24_rgb;
     case AV_PIX_FMT_BGR24:
         return yuv2rgb_c_24_bgr;
     case AV_PIX_FMT_RGB565:
     case AV_PIX_FMT_BGR565:
+        return yuv2rgb_c_16_ordered_dither;
     case AV_PIX_FMT_RGB555:
     case AV_PIX_FMT_BGR555:
-        return yuv2rgb_c_16;
+        return yuv2rgb_c_15_ordered_dither;
     case AV_PIX_FMT_RGB444:
     case AV_PIX_FMT_BGR444:
         return yuv2rgb_c_12_ordered_dither;
@@ -621,36 +662,32 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
         return yuv2rgb_c_4b_ordered_dither;
     case AV_PIX_FMT_MONOBLACK:
         return yuv2rgb_c_1_ordered_dither;
-    default:
-        assert(0);
     }
     return NULL;
 }
 
-static void fill_table(uint8_t *table[256], const int elemsize,
-                       const int inc, void *y_tab)
+static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
+                       const int64_t inc, void *y_tab)
 {
     int i;
-    int64_t cb       = 0;
     uint8_t *y_table = y_tab;
 
     y_table -= elemsize * (inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = y_table + elemsize * (cb >> 16);
-        cb      += inc;
     }
 }
 
-static void fill_gv_table(int table[256], const int elemsize, const int inc)
+static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
 {
     int i;
-    int64_t cb = 0;
     int off    = -(inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = elemsize * (off + (cb >> 16));
-        cb      += inc;
     }
 }
 
@@ -693,7 +730,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     uint8_t *y_table;
     uint16_t *y_table16;
     uint32_t *y_table32;
-    int i, base, rbase, gbase, bbase, abase, needAlpha;
+    int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
     const int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];