34 files changed, 1381 insertions, 938 deletions
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 0799b458be..dd00f7d708 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -1,3 +1,5 @@
+include $(SUBDIR)../config.mak
+
 NAME = swscale
 FFLIBS = avutil
 
diff --git a/libswscale/bfin/internal_bfin.S b/libswscale/bfin/internal_bfin.S
index b007f07f53..eab30aa6ce 100644
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -5,20 +5,20 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index f2fe871fc8..a51b2af56c 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -3,20 +3,20 @@
  *
  * Blackfin software video scaler operations
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
index 4078a18660..dae7f314e7 100644
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -4,23 +4,24 @@
  * Blackfin video color space converter operations
  * convert I420 YV12 to RGB in various formats
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavutil/pixdesc.h"
 #include <stdint.h>
 
 #include "config.h"
@@ -195,7 +196,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
     }
 
     av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
-           sws_format_name(c->dstFormat));
+           av_get_pix_fmt_name(c->dstFormat));
 
     return f;
 }
diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c
index 135924c1ab..440b5d369a 100644
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -33,7 +33,7 @@
 
 #define FUNC(s, d, n) { s, d, #n, n }
 
-int main(void)
+int main(int argc, char **argv)
 {
     int i, funcNum;
     uint8_t *srcBuffer = av_malloc(SIZE);
@@ -54,6 +54,7 @@ int main(void)
             const char *name;
             void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
         } func_info[] = {
+            FUNC(2, 2, rgb12to15),
             FUNC(2, 2, rgb15to16),
             FUNC(2, 3, rgb15to24),
             FUNC(2, 4, rgb15to32),
@@ -66,6 +67,7 @@ int main(void)
             FUNC(4, 2, rgb32to16),
             FUNC(4, 3, rgb32to24),
             FUNC(2, 2, rgb16to15),
+            FUNC(2, 2, rgb12tobgr12),
             FUNC(2, 2, rgb15tobgr15),
             FUNC(2, 2, rgb15tobgr16),
             FUNC(2, 3, rgb15tobgr24),
@@ -82,6 +84,8 @@ int main(void)
             FUNC(4, 2, rgb32tobgr16),
             FUNC(4, 3, rgb32tobgr24),
             FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
+            FUNC(6, 6, rgb48tobgr48_LL),
+            FUNC(6, 6, rgb48tobgr48_LB),
             FUNC(0, 0, NULL)
         };
         int width;
diff --git a/libswscale/input.c b/libswscale/input.c
index 40ed122427..c9c91d0bca 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -51,6 +51,86 @@
 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
 
+static av_always_inline void
+rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
+                    enum PixelFormat origin)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+        unsigned int r_b = input_pixel(&src[i*4+0]);
+        unsigned int   g = input_pixel(&src[i*4+1]);
+        unsigned int b_r = input_pixel(&src[i*4+2]);
+
+        dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
+                    const uint16_t *src1, const uint16_t *src2,
+                    int width, enum PixelFormat origin)
+{
+    int i;
+    assert(src1==src2);
+    for (i = 0; i < width; i++) {
+        int r_b = input_pixel(&src1[i*4+0]);
+        int   g = input_pixel(&src1[i*4+1]);
+        int b_r = input_pixel(&src1[i*4+2]);
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+                          const uint16_t *src1, const uint16_t *src2,
+                          int width, enum PixelFormat origin)
+{
+    int i;
+    assert(src1==src2);
+    for (i = 0; i < width; i++) {
+        int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+        int   g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+        dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src = (const uint16_t *) _src; \
+    uint16_t *dst = (uint16_t *) _dst; \
+    rgb64ToY_c_template(dst, src, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
+                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+                                    int width, uint32_t *unused) \
+{ \
+    const uint16_t *src1 = (const uint16_t *) _src1, \
+                   *src2 = (const uint16_t *) _src2; \
+    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+    rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
+}
+
+rgb64funcs(rgb, LE, PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, PIX_FMT_RGBA64BE)
+
 static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
                                                  const uint16_t *src, int width,
                                                  enum PixelFormat origin)
@@ -113,6 +193,7 @@ static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
 #define rgb48funcs(pattern, BE_LE, origin)                              \
 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                             const uint8_t *_src,        \
+                                            const uint8_t *unused0, const uint8_t *unused1,\
                                             int width,                  \
                                             uint32_t *unused)           \
 {                                                                       \
@@ -123,6 +204,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst,              \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                                              uint8_t *_dstV,            \
+                                             const uint8_t *unused0,    \
                                              const uint8_t *_src1,      \
                                              const uint8_t *_src2,      \
                                              int width,                 \
@@ -137,6 +219,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU,            \
                                                                         \
 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU,       \
                                                   uint8_t *_dstV,       \
+                                                  const uint8_t *unused0,    \
                                                   const uint8_t *_src1, \
                                                   const uint8_t *_src2, \
                                                   int width,            \
@@ -162,7 +245,7 @@ rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
                         : (isBE(origin) ? AV_RB16(&src[(i) * 2])        \
                                         : AV_RL16(&src[(i) * 2])))
 
-static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
+static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
                                                     const uint8_t *src,
                                                     int width,
                                                     enum PixelFormat origin,
@@ -173,7 +256,7 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
                                                     int gsh, int bsh, int S)
 {
     const int ry       = RY << rsh, gy = GY << gsh, by = BY << bsh;
-    const unsigned rnd = 33u << (S - 1);
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -182,12 +265,12 @@ static av_always_inline void rgb16_32ToY_c_template(uint8_t *dst,
         int g  = (px & maskg) >> shg;
         int r  = (px & maskr) >> shr;
 
-        dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
+        dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
     }
 }
 
-static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
-                                                     uint8_t *dstV,
+static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
+                                                     int16_t *dstV,
                                                      const uint8_t *src,
                                                      int width,
                                                      enum PixelFormat origin,
@@ -199,7 +282,7 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
 {
     const int ru       = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv       = RV << rsh, gv = GV << gsh, bv = BV << bsh;
-    const unsigned rnd = 257u << (S - 1);
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
     int i;
 
     for (i = 0; i < width; i++) {
@@ -208,13 +291,13 @@ static av_always_inline void rgb16_32ToUV_c_template(uint8_t *dstU,
         int g  = (px & maskg)   >> shg;
         int r  = (px & maskr)   >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
+        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
     }
 }
 
-static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
-                                                          uint8_t *dstV,
+static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
+                                                          int16_t *dstV,
                                                           const uint8_t *src,
                                                           int width,
                                                           enum PixelFormat origin,
@@ -227,7 +310,7 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
     const int ru       = RU << rsh, gu = GU << gsh, bu = BU << bsh,
               rv       = RV << rsh, gv = GV << gsh, bv = BV << bsh,
               maskgx   = ~(maskr | maskb);
-    const unsigned rnd = 257u << S;
+    const unsigned rnd = (256U<<(S)) + (1<<(S-6));
     int i;
 
     maskr |= maskr << 1;
@@ -249,8 +332,8 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
         }
         r = (rb & maskr) >> shr;
 
-        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
-        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
+        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
     }
 }
 
@@ -258,28 +341,28 @@ static av_always_inline void rgb16_32ToUV_half_c_template(uint8_t *dstU,
 
 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr,          \
                          maskg, maskb, rsh, gsh, bsh, S)                \
-static void name ## ToY_c(uint8_t *dst, const uint8_t *src,             \
+static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,            \
                           int width, uint32_t *unused)                  \
 {                                                                       \
-    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp,    \
+    rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp,    \
                            maskr, maskg, maskb, rsh, gsh, bsh, S);      \
 }                                                                       \
                                                                         \
 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV,                \
-                           const uint8_t *src, const uint8_t *dummy,    \
+                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy,    \
                            int width, uint32_t *unused)                 \
 {                                                                       \
-    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt,                \
+    rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,                \
                             shr, shg, shb, shp,                         \
                             maskr, maskg, maskb, rsh, gsh, bsh, S);     \
 }                                                                       \
                                                                         \
 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV,           \
-                                const uint8_t *src,                     \
+                                const uint8_t *unused0, const uint8_t *src,                     \
                                 const uint8_t *dummy,                   \
                                 int width, uint32_t *unused)            \
 {                                                                       \
-    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt,           \
+    rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,           \
                                  shr, shg, shb, shp,                    \
                                  maskr, maskg, maskb,                   \
                                  rsh, gsh, bsh, S);                     \
@@ -302,34 +385,67 @@ rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x0
 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT + 7)
 rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0,  0, 0,   0x0F00, 0x00F0,   0x000F,  0, 4,  8, RGB2YUV_SHIFT + 4)
 
-static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
+static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
+                         const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+                         int width, enum PixelFormat origin)
 {
     int i;
-    for (i = 0; i < width; i++)
-        dst[i] = src[4 * i];
+    for (i = 0; i < width; i++) {
+        unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
+        unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
+        unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
+
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+    }
 }
 
-static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width,
-                      uint32_t *unused)
+static void rgba64ToA_c(int16_t *dst, const uint16_t *src, const uint8_t *unused1,
+                        const uint8_t *unused2, int width, uint32_t *unused)
 {
     int i;
     for (i = 0; i < width; i++)
         dst[i] = src[4 * i + 3];
 }
 
-static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
+static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i]<<6;
+    }
+}
+
+static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i+3]<<6;
+    }
+}
+
+static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        int d= src[i];
+
+        dst[i]= (pal[d] >> 24)<<6;
+    }
+}
+
+static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, long width, uint32_t *pal)
 {
     int i;
     for (i = 0; i < width; i++) {
         int d = src[i];
 
-        dst[i] = pal[d] & 0xFF;
+        dst[i] = (pal[d] & 0xFF)<<6;
     }
 }
 
-static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+static void palToUV_c(uint16_t *dstU, int16_t *dstV,
+                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *pal)
 {
     int i;
@@ -337,36 +453,44 @@ static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
     for (i = 0; i < width; i++) {
         int p = pal[src1[i]];
 
-        dstU[i] = p >> 8;
-        dstV[i] = p >> 16;
+        dstU[i] = (uint8_t)(p>> 8)<<6;
+        dstV[i] = (uint8_t)(p>>16)<<6;
     }
 }
 
-static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
     int i, j;
     width = (width + 7) >> 3;
     for (i = 0; i < width; i++) {
         int d = ~src[i];
         for (j = 0; j < 8; j++)
-            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    if(width&7){
+        int d= ~src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
     }
 }
 
-static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
-                          int width, uint32_t *unused)
+static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
 {
     int i, j;
     width = (width + 7) >> 3;
     for (i = 0; i < width; i++) {
         int d = src[i];
         for (j = 0; j < 8; j++)
-            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
+            dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+    }
+    if(width&7){
+        int d = src[i];
+        for (j = 0; j < (width&7); j++)
+            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
     }
 }
 
-static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -374,7 +498,7 @@ static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i] = src[2 * i];
 }
 
-static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -385,7 +509,7 @@ static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
     assert(src1 == src2);
 }
 
-static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width,
+static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                        uint32_t *unused)
 {
     int i;
@@ -395,7 +519,7 @@ static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width,
         dst[i] = av_bswap16(src[i]);
 }
 
-static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
+static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                         const uint8_t *_src2, int width, uint32_t *unused)
 {
     int i;
@@ -410,7 +534,7 @@ static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
 
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
-static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                       uint32_t *unused)
 {
     int i;
@@ -418,7 +542,7 @@ static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
         dst[i] = src[2 * i + 1];
 }
 
-static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                        const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -440,14 +564,14 @@ static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
 }
 
 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstU, dstV, src1, width);
 }
 
 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
-                       const uint8_t *src1, const uint8_t *src2,
+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                        int width, uint32_t *unused)
 {
     nvXXtoUV_c(dstV, dstU, src1, width);
@@ -455,7 +579,7 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
 
 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
-static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
+static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
                        int width, uint32_t *unused)
 {
     int i;
@@ -464,11 +588,11 @@ static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
         int g = src[i * 3 + 1];
         int r = src[i * 3 + 2];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -477,13 +601,13 @@ static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g = src1[3 * i + 1];
         int r = src1[3 * i + 2];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
     assert(src1 == src2);
 }
 
-static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -492,13 +616,13 @@ static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g = src1[6 * i + 1] + src1[6 * i + 4];
         int r = src1[6 * i + 2] + src1[6 * i + 5];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
     assert(src1 == src2);
 }
 
-static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
+static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
                        uint32_t *unused)
 {
     int i;
@@ -507,11 +631,11 @@ static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
         int g = src[i * 3 + 1];
         int b = src[i * 3 + 2];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
     }
 }
 
-static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                         const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -521,12 +645,12 @@ static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g = src1[3 * i + 1];
         int b = src1[3 * i + 2];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
     }
 }
 
-static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
+static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                              const uint8_t *src2, int width, uint32_t *unused)
 {
     int i;
@@ -536,12 +660,12 @@ static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
         int g = src1[6 * i + 1] + src1[6 * i + 4];
         int b = src1[6 * i + 2] + src1[6 * i + 5];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
+        dstU[i] = (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+        dstV[i] = (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
     }
 }
 
-static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
+static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -549,11 +673,11 @@ static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
         int b = src[1][i];
         int r = src[2][i];
 
-        dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
+        dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
-static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
+static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
 {
     int i;
     for (i = 0; i < width; i++) {
@@ -561,8 +685,8 @@ static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4]
         int b = src[1][i];
         int r = src[2][i];
 
-        dstU[i] = (RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
-        dstV[i] = (RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+        dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+        dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
     }
 }
 
@@ -742,6 +866,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     }
     if (c->chrSrcHSubSample) {
         switch (srcFormat) {
+        case PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_half_c;
+            break;
+        case PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_half_c;
+            break;
         case PIX_FMT_RGB48BE:
             c->chrToYV12 = rgb48BEToUV_half_c;
             break;
@@ -775,6 +905,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case PIX_FMT_BGR555BE:
             c->chrToYV12 = bgr15beToUV_half_c;
             break;
+        case PIX_FMT_GBR24P  :
+            c->chrToYV12 = gbr24pToUV_half_c;
+            break;
         case PIX_FMT_BGR444LE:
             c->chrToYV12 = bgr12leToUV_half_c;
             break;
@@ -811,6 +944,12 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         }
     } else {
         switch (srcFormat) {
+        case PIX_FMT_RGBA64BE:
+            c->chrToYV12 = rgb64BEToUV_c;
+            break;
+        case PIX_FMT_RGBA64LE:
+            c->chrToYV12 = rgb64LEToUV_c;
+            break;
         case PIX_FMT_RGB48BE:
             c->chrToYV12 = rgb48BEToUV_c;
             break;
@@ -1017,9 +1156,17 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
     case PIX_FMT_BGR48LE:
         c->lumToYV12 = bgr48LEToY_c;
         break;
+    case PIX_FMT_RGBA64BE:
+        c->lumToYV12 = rgb64BEToY_c;
+        break;
+    case PIX_FMT_RGBA64LE:
+        c->lumToYV12 = rgb64LEToY_c;
+        break;
     }
     if (c->alpPixBuf) {
         switch (srcFormat) {
+        case PIX_FMT_RGBA64LE:
+        case PIX_FMT_RGBA64BE:  c->alpToYV12 = rgba64ToA_c; break;
         case PIX_FMT_BGRA:
         case PIX_FMT_RGBA:
             c->alpToYV12 = rgbaToA_c;
@@ -1031,6 +1178,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
         case PIX_FMT_Y400A:
             c->alpToYV12 = uyvyToY_c;
             break;
+        case PIX_FMT_PAL8 :
+            c->alpToYV12 = palToA_c;
+            break;
         }
     }
 }
diff --git a/libswscale/options.c b/libswscale/options.c
index 7ed5254aa8..a396a58f40 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,7 +66,11 @@ static const AVOption options[] = {
     { NULL }
 };
 
-const AVClass sws_context_class = { "SWScaler", sws_context_to_name, options };
+const AVClass sws_context_class = {
+    .class_name = "SWScaler",
+    .item_name  = sws_context_to_name,
+    .option     = options,
+};
 
 const AVClass *sws_get_class(void)
 {
diff --git a/libswscale/output.c b/libswscale/output.c
index b58bda9213..c8365219f8 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -25,6 +25,7 @@
 #include <string.h>
 
 #include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
 #include "libavutil/intreadwrite.h"
@@ -135,7 +136,8 @@ yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 19 - output_bits;
+    int shift = 3;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
         int val = src[i] + (1 << (shift - 1));
@@ -149,10 +151,11 @@ yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
                          int big_endian, int output_bits)
 {
     int i;
-    int shift = 15 + 16 - output_bits;
+    int shift = 15;
+    av_assert0(output_bits == 16);
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (30-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
@@ -199,7 +202,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
     int shift = 11 + 16 - output_bits;
 
     for (i = 0; i < dstW; i++) {
-        int val = 1 << (26-output_bits);
+        int val = 1 << (shift - 1);
         int j;
 
         for (j = 0; j < filterSize; j++)
@@ -392,14 +395,14 @@ yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
     for (i = 0; i < dstW; i += 8) {
         int acc = 0;
 
-        accumulate_bit(acc, (buf0[i + 0] >> 7) + d128[0]);
-        accumulate_bit(acc, (buf0[i + 1] >> 7) + d128[1]);
-        accumulate_bit(acc, (buf0[i + 2] >> 7) + d128[2]);
-        accumulate_bit(acc, (buf0[i + 3] >> 7) + d128[3]);
-        accumulate_bit(acc, (buf0[i + 4] >> 7) + d128[4]);
-        accumulate_bit(acc, (buf0[i + 5] >> 7) + d128[5]);
-        accumulate_bit(acc, (buf0[i + 6] >> 7) + d128[6]);
-        accumulate_bit(acc, (buf0[i + 7] >> 7) + d128[7]);
+        accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]);
+        accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]);
+        accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]);
+        accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]);
+        accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]);
+        accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]);
+        accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]);
+        accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]);
 
         output_pixel(*dest++, acc);
     }
@@ -515,10 +518,12 @@ yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 
-        Y1 = av_clip_uint8(Y1);
-        Y2 = av_clip_uint8(Y2);
-        U  = av_clip_uint8(U);
-        V  = av_clip_uint8(V);
+        if ((Y1 | Y2 | U | V) & 0x100) {
+            Y1 = av_clip_uint8(Y1);
+            Y2 = av_clip_uint8(Y2);
+            U  = av_clip_uint8(U);
+            V  = av_clip_uint8(V);
+        }
 
         output_pixels(i * 4, Y1, U, Y2, V);
     }
@@ -535,10 +540,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ]+64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]+64) >> 7;
+            int U  = (ubuf0[i]       +64) >> 7;
+            int V  = (vbuf0[i]       +64) >> 7;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             Y1 = av_clip_uint8(Y1);
             Y2 = av_clip_uint8(Y2);
@@ -550,10 +562,17 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]    + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]    + 64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i]+128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i]+128) >> 8;
+
+            if ((Y1 | Y2 | U | V) & 0x100) {
+                Y1 = av_clip_uint8(Y1);
+                Y2 = av_clip_uint8(Y2);
+                U  = av_clip_uint8(U);
+                V  = av_clip_uint8(V);
+            }
 
             Y1 = av_clip_uint8(Y1);
             Y2 = av_clip_uint8(Y2);
@@ -811,7 +830,7 @@ YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
  * correct RGB values into the destination buffer.
  */
 static av_always_inline void
-yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
+yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
               unsigned A1, unsigned A2,
               const void *_r, const void *_g, const void *_b, int y,
               enum PixelFormat target, int hasAlpha)
@@ -847,6 +866,7 @@ yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
 
 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
+
         dest[i * 6 + 0] = r_b[Y1];
         dest[i * 6 + 1] =   g[Y1];
         dest[i * 6 + 2] = b_r[Y1];
@@ -952,12 +972,6 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
         Y2 >>= 19;
         U  >>= 19;
         V  >>= 19;
-        if ((Y1 | Y2 | U | V) & 0x100) {
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-        }
         if (hasAlpha) {
             A1 = 1 << 18;
             A2 = 1 << 18;
@@ -973,10 +987,9 @@ yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
             }
         }
 
-        /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
+        r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM];
+        g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
+        b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
                       r, g, b, y, target, hasAlpha);
@@ -1005,16 +1018,9 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
         int A1, A2;
-        const void *r, *g, *b;
-
-        Y1 = av_clip_uint8(Y1);
-        Y2 = av_clip_uint8(Y2);
-        U  = av_clip_uint8(U);
-        V  = av_clip_uint8(V);
-
-        r =  c->table_rV[V];
-        g = (c->table_gU[U] + c->table_gV[V]);
-        b =  c->table_bU[U];
+        const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                   *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                   *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
         if (hasAlpha) {
             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
@@ -1040,25 +1046,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
 
     if (uvalpha < 2048) {
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 = buf0[i * 2]     >> 7;
-            int Y2 = buf0[i * 2 + 1] >> 7;
-            int U  = ubuf0[i]        >> 7;
-            int V  = vbuf0[i]        >> 7;
+            int Y1 = (buf0[i * 2    ] + 64) >> 7;
+            int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
+            int U  = (ubuf0[i]        + 64) >> 7;
+            int V  = (vbuf0[i]        + 64) >> 7;
             int A1, A2;
-            const void *r, *g, *b;
-
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-
-            r =  c->table_rV[V];
-            g = (c->table_gU[U] + c->table_gV[V]);
-            b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = abuf0[i * 2    ] * 255 + 16384 >> 15;
+                A2 = abuf0[i * 2 + 1] * 255 + 16384 >> 15;
                 A1 = av_clip_uint8(A1);
                 A2 = av_clip_uint8(A2);
             }
@@ -1069,25 +1068,18 @@ yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
     } else {
         const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
         for (i = 0; i < ((dstW + 1) >> 1); i++) {
-            int Y1 =  buf0[i * 2]          >> 7;
-            int Y2 =  buf0[i * 2 + 1]      >> 7;
-            int U  = (ubuf0[i] + ubuf1[i]) >> 8;
-            int V  = (vbuf0[i] + vbuf1[i]) >> 8;
+            int Y1 = (buf0[i * 2    ]     +  64) >> 7;
+            int Y2 = (buf0[i * 2 + 1]     +  64) >> 7;
+            int U  = (ubuf0[i] + ubuf1[i] + 128) >> 8;
+            int V  = (vbuf0[i] + vbuf1[i] + 128) >> 8;
             int A1, A2;
-            const void *r, *g, *b;
-
-            Y1 = av_clip_uint8(Y1);
-            Y2 = av_clip_uint8(Y2);
-            U  = av_clip_uint8(U);
-            V  = av_clip_uint8(V);
-
-            r =  c->table_rV[V];
-            g = (c->table_gU[U] + c->table_gV[V]);
-            b =  c->table_bU[U];
+            const void *r =  c->table_rV[V + YUVRGB_TABLE_HEADROOM],
+                       *g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
+                       *b =  c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
             if (hasAlpha) {
-                A1 = abuf0[i * 2    ] >> 7;
-                A2 = abuf0[i * 2 + 1] >> 7;
+                A1 = (abuf0[i * 2    ] + 64) >> 7;
+                A2 = (abuf0[i * 2 + 1] + 64) >> 7;
                 A1 = av_clip_uint8(A1);
                 A2 = av_clip_uint8(A2);
             }
@@ -1163,9 +1155,9 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
 
     for (i = 0; i < dstW; i++) {
         int j;
-        int Y = 0;
-        int U = -128 << 19;
-        int V = -128 << 19;
+        int Y = 1<<9;
+        int U = (1<<9)-(128 << 19);
+        int V = (1<<9)-(128 << 19);
         int R, G, B, A;
 
         for (j = 0; j < lumFilterSize; j++) {
@@ -1179,7 +1171,7 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
         U >>= 10;
         V >>= 10;
         if (hasAlpha) {
-            A = 1 << 21;
+            A = 1 << 18;
             for (j = 0; j < lumFilterSize; j++) {
                 A += alpSrc[j][i] * lumFilter[j];
             }
@@ -1222,7 +1214,6 @@ yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
             dest[1] = B >> 22;
             dest[2] = G >> 22;
             dest[3] = R >> 22;
-            dest += 4;
             break;
         case PIX_FMT_BGR24:
             dest[0] = B >> 22;
@@ -1353,7 +1344,10 @@ void ff_sws_init_output_funcs(SwsContext *c,
             *yuv2packedX = yuv2bgr24_full_X_c;
             break;
         }
+        if(!*yuv2packedX)
+            goto YUV_PACKED;
     } else {
+        YUV_PACKED:
         switch (dstFormat) {
         case PIX_FMT_RGB48LE:
             *yuv2packed1 = yuv2rgb48le_1_c;
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 5537707bd0..df8afb0ba2 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
index 3a140cb1f8..5c2e2c8196 100644
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -3,20 +3,20 @@
  *
  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -96,6 +96,7 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
 #include "yuv2rgb_altivec.h"
 
 #undef PROFILE_THE_BEAST
@@ -733,7 +734,7 @@ static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c,
             if (!printed_error_message) {
                 av_log(c, AV_LOG_ERROR,
                        "altivec_yuv2packedX doesn't support %s output\n",
-                       sws_format_name(c->dstFormat));
+                       av_get_pix_fmt_name(c->dstFormat));
                 printed_error_message = 1;
             }
             return;
@@ -821,7 +822,7 @@ static av_always_inline void ff_yuv2packedX_altivec(SwsContext *c,
             /* Unreachable, I think. */
             av_log(c, AV_LOG_ERROR,
                    "altivec_yuv2packedX doesn't support %s output\n",
-                   sws_format_name(c->dstFormat));
+                   av_get_pix_fmt_name(c->dstFormat));
             return;
         }
 
diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h
index 2c5e7ed876..aa52a4743e 100644
--- a/libswscale/ppc/yuv2rgb_altivec.h
+++ b/libswscale/ppc/yuv2rgb_altivec.h
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/ppc/yuv2yuv_altivec.c b/libswscale/ppc/yuv2yuv_altivec.c
index 45d766bd02..af78782b54 100644
--- a/libswscale/ppc/yuv2yuv_altivec.c
+++ b/libswscale/ppc/yuv2yuv_altivec.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
  * based on the equivalent C code in swscale.c
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 14b595f107..afc8d1f3a1 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -6,26 +6,27 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <inttypes.h>
 
 #include "libavutil/bswap.h"
+#include "libavutil/intreadwrite.h"
 #include "config.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -180,13 +181,13 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
 #else
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
         *d++ = 255;
 #endif
     }
@@ -219,9 +220,9 @@ void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
     }
 }
 
@@ -255,13 +256,13 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
 #else
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
         *d++ = 255;
 #endif
     }
@@ -275,9 +276,9 @@ void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
     }
 }
 
@@ -314,18 +315,6 @@ void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size)
-{
-    int i, num_pixels = src_size;
-
-    for (i = 0; i < num_pixels; i++) {
-        register uint8_t rgb = src[i];
-        unsigned r           = (rgb & 0x07);
-        unsigned g           = (rgb & 0x38) >> 3;
-        unsigned b           = (rgb & 0xC0) >> 6;
-        dst[i]               = ((b << 1) & 0x07) | ((g & 0x07) << 3) | ((r & 0x03) << 6);
-    }
-}
 
 #define DEFINE_SHUFFLE_BYTES(a, b, c, d)                                \
 void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src,             \
@@ -345,3 +334,21 @@ DEFINE_SHUFFLE_BYTES(0, 3, 2, 1)
 DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
 DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
 DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
+
+#define DEFINE_RGB48TOBGR48(ie, oe)                                     \
+void rgb48tobgr48_ ## ie ## oe(const uint8_t *src,                      \
+                               uint8_t *dst, int src_size)              \
+{                                                                       \
+    uint16_t *d = (uint16_t *)dst;                                      \
+    uint16_t *s = (uint16_t *)src;                                      \
+    int i, num_pixels = src_size >> 1;                                  \
+                                                                        \
+    for (i = 0; i < num_pixels; i += 3) {                               \
+        AV_W ## oe ## 16(&d[i    ], AV_R ## ie ## 16(&s[i + 2]));       \
+        AV_W ## oe ## 16(&d[i + 1], AV_R ## ie ## 16(&s[i + 1]));       \
+        AV_W ## oe ## 16(&d[i + 2], AV_R ## ie ## 16(&s[i    ]));       \
+    }                                                                   \
+}
+
+DEFINE_RGB48TOBGR48(L, L)
+DEFINE_RGB48TOBGR48(L, B)
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 42f468fe21..6e3be2a573 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -6,20 +6,20 @@
  *  Written by Nick Kurshev.
  *  YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -52,6 +52,8 @@ extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
 
 extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
 
+void rgb48tobgr48_LL(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb48tobgr48_LB(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
@@ -64,7 +66,6 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
-void   bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size);
 
 void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
 void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index d1a43e01cb..c05cdc8549 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -238,27 +238,6 @@ static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-/*
- * I use less accurate approximation here by simply left-shifting the input
- * value and filling the low order bits with zeroes. This method improves PNG
- * compression but this scheme cannot reproduce white exactly, since it does
- * not generate an all-ones maximum value; the net effect is to darken the
- * image slightly.
- *
- * The better method should be "left bit replication":
- *
- *  4 3 2 1 0
- *  ---------
- *  1 1 0 1 1
- *
- *  7 6 5 4 3  2 1 0
- *  ----------------
- *  1 1 0 1 1  1 1 0
- *  |=======|  |===|
- *      |      leftmost bits repeated to fill open bits
- *      |
- *  original bits
- */
 static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
                                   int src_size)
 {
@@ -268,9 +247,9 @@ static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -283,9 +262,9 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
 
     while (s < end) {
         register uint16_t bgr = *s++;
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -299,13 +278,13 @@ static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0x7C00) >> 7;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 #else
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x3E0)  >> 2;
-        *d++ = (bgr & 0x7C00) >> 7;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
 #endif
     }
@@ -321,13 +300,13 @@ static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
         register uint16_t bgr = *s++;
 #if HAVE_BIGENDIAN
         *d++ = 255;
-        *d++ = (bgr & 0xF800) >> 8;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0x1F)   << 3;
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
 #else
-        *d++ = (bgr & 0x1F)   << 3;
-        *d++ = (bgr & 0x7E0)  >> 3;
-        *d++ = (bgr & 0xF800) >> 8;
+        *d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
+        *d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
 #endif
     }
@@ -665,6 +644,9 @@ void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
         ydst += lumStride;
         src  += srcStride;
 
+        if (y+1 == height)
+            break;
+
         for (i = 0; i < chromWidth; i++) {
             unsigned int b = src[6 * i + 0];
             unsigned int g = src[6 * i + 1];
diff --git a/libswscale/sparc/yuv2rgb_vis.c b/libswscale/sparc/yuv2rgb_vis.c
index bcd2081aa8..62f502619c 100644
--- a/libswscale/sparc/yuv2rgb_vis.c
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -2,20 +2,20 @@
  * VIS optimized software YUV to RGB converter
  * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/swscale-test.c b/libswscale/swscale-test.c
index 3497dffbe0..ef6c55ce02 100644
--- a/libswscale/swscale-test.c
+++ b/libswscale/swscale-test.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -103,6 +103,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
 
         av_image_fill_linesizes(srcStride, srcFormat, srcW);
         for (p = 0; p < 4; p++) {
+            srcStride[p] = FFALIGN(srcStride[p], 16);
             if (srcStride[p])
                 src[p] = av_mallocz(srcStride[p] * srcH + 16);
             if (srcStride[p] && !src[p]) {
@@ -136,6 +137,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
          * allocated with av_malloc). */
         /* An extra 16 bytes is being allocated because some scalers may write
          * out of bounds. */
+        dstStride[i] = FFALIGN(dstStride[i], 16);
         if (dstStride[i])
             dst[i] = av_mallocz(dstStride[i] * dstH + 16);
         if (dstStride[i] && !dst[i]) {
@@ -175,6 +177,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
         ssdA = r->ssdA;
     } else {
         for (i = 0; i < 4; i++) {
+            refStride[i] = FFALIGN(refStride[i], 16);
             if (refStride[i])
                 out[i] = av_mallocz(refStride[i] * h);
             if (refStride[i] && !out[i]) {
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7ae5af37a8..5dd4124eba 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
@@ -71,6 +72,9 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
     int bits            = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
     int sh              = bits - 4;
 
+    if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+        sh= 9;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -92,6 +96,9 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
     const uint16_t *src = (const uint16_t *) _src;
     int sh              = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
 
+    if(sh<15)
+        sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
     for (i = 0; i < dstW; i++) {
         int j;
         int srcPos = filterPos[i];
@@ -208,7 +215,7 @@ static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
     int i;
     int32_t *dst = (int32_t *) _dst;
     for (i = 0; i < width; i++)
-        dst[i] = (dst[i] * 14071 + (33561947 << 4)) >> 14;
+        dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
 }
 
 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
@@ -222,6 +229,8 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
         xpos  += xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+        dst[i] = src[srcW-1]*128;
 }
 
 // *** horizontal scale Y line to temp buffer
@@ -234,13 +243,13 @@ static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
                                      uint8_t *formatConvBuffer,
                                      uint32_t *pal, int isAlpha)
 {
-    void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) =
+    void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) =
         isAlpha ? c->alpToYV12 : c->lumToYV12;
     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
     const uint8_t *src = src_in[isAlpha ? 3 : 0];
 
     if (toYV12) {
-        toYV12(formatConvBuffer, src, srcW, pal);
+        toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
         src = formatConvBuffer;
     } else if (c->readLumPlanar && !isAlpha) {
         c->readLumPlanar(formatConvBuffer, src_in, srcW);
@@ -271,6 +280,10 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
         dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
         xpos   += xInc;
     }
+    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
+        dst1[i] = src1[srcW-1]*128;
+        dst2[i] = src2[srcW-1]*128;
+    }
 }
 
 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
@@ -285,13 +298,13 @@ static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
     if (c->chrToYV12) {
         uint8_t *buf2 = formatConvBuffer +
-                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
-        c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
-        src1 = formatConvBuffer;
-        src2 = buf2;
+                        FFALIGN(srcW*2+78, 16);
+        c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
+        src1= formatConvBuffer;
+        src2= buf2;
     } else if (c->readChrPlanar) {
         uint8_t *buf2 = formatConvBuffer +
-                        FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
+                        FFALIGN(srcW*2+78, 16);
         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
         src1 = formatConvBuffer;
         src2 = buf2;
@@ -392,8 +405,8 @@ static int swScale(SwsContext *c, const uint8_t *src[],
     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
 
-    if (dstStride[0] % 8 != 0 || dstStride[1] % 8 != 0 ||
-        dstStride[2] % 8 != 0 || dstStride[3] % 8 != 0) {
+    if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 ||
+        dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
         static int warnedAlready = 0; // FIXME maybe move this into the context
         if (flags & SWS_PRINT_INFO && !warnedAlready) {
             av_log(c, AV_LOG_WARNING,
@@ -403,6 +416,18 @@ static int swScale(SwsContext *c, const uint8_t *src[],
         }
     }
 
+    if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
+        || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
+        || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
+    ) {
+        static int warnedAlready=0;
+        int cpu_flags = av_get_cpu_flags();
+        if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
+            av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
+            warnedAlready=1;
+        }
+    }
+
     /* Note the user might start scaling the picture in the middle so this
      * will not get executed. This is not really intended but works
      * currently, so people might do it. */
@@ -427,6 +452,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
             dst[2] + dstStride[2] * chrDstY,
             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
         };
+        int use_mmx_vfilter= c->use_mmx_vfilter;
 
         // First line needed as input
         const int firstLumSrcY  = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
@@ -531,98 +557,74 @@ static int swScale(SwsContext *c, const uint8_t *src[],
              * this array's tail */
             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                      &yuv2packed1, &yuv2packed2, &yuv2packedX);
+            use_mmx_vfilter= 0;
         }
 
         {
-            const int16_t **lumSrcPtr  = (const int16_t **)lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrUSrcPtr = (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **chrVSrcPtr = (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+            const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+            const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
-                                         (const int16_t **)alpPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-
-            if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
-                const int16_t **tmpY = (const int16_t **)lumPixBuf +
-                                       2 * vLumBufSize;
-                int neg = -firstLumSrcY, i;
-                int end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
-                for (i = 0; i < neg; i++)
-                    tmpY[i] = lumSrcPtr[neg];
-                for (; i < end; i++)
-                    tmpY[i] = lumSrcPtr[i];
-                for (; i < vLumFilterSize; i++)
-                    tmpY[i] = tmpY[i - 1];
-                lumSrcPtr = tmpY;
-
-                if (alpSrcPtr) {
-                    const int16_t **tmpA = (const int16_t **)alpPixBuf +
-                                           2 * vLumBufSize;
-                    for (i = 0; i < neg; i++)
-                        tmpA[i] = alpSrcPtr[neg];
-                    for (; i < end; i++)
-                        tmpA[i] = alpSrcPtr[i];
-                    for (; i < vLumFilterSize; i++)
-                        tmpA[i] = tmpA[i - 1];
-                    alpSrcPtr = tmpA;
-                }
-            }
-            if (firstChrSrcY < 0 ||
-                firstChrSrcY + vChrFilterSize > c->chrSrcH) {
-                const int16_t **tmpU = (const int16_t **)chrUPixBuf + 2 * vChrBufSize,
-                **tmpV               = (const int16_t **)chrVPixBuf + 2 * vChrBufSize;
-                int neg = -firstChrSrcY, i;
-                int end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
-                for (i = 0; i < neg; i++) {
-                    tmpU[i] = chrUSrcPtr[neg];
-                    tmpV[i] = chrVSrcPtr[neg];
-                }
-                for (; i < end; i++) {
-                    tmpU[i] = chrUSrcPtr[i];
-                    tmpV[i] = chrVSrcPtr[i];
-                }
-                for (; i < vChrFilterSize; i++) {
-                    tmpU[i] = tmpU[i - 1];
-                    tmpV[i] = tmpV[i - 1];
-                }
-                chrUSrcPtr = tmpU;
-                chrVSrcPtr = tmpV;
-            }
+                                         (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+            int16_t *vLumFilter = c->vLumFilter;
+            int16_t *vChrFilter = c->vChrFilter;
 
             if (isPlanarYUV(dstFormat) ||
                 (isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like
                 const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
 
+                vLumFilter +=    dstY * vLumFilterSize;
+                vChrFilter += chrDstY * vChrFilterSize;
+
+//                 av_assert0(use_mmx_vfilter != (
+//                                yuv2planeX == yuv2planeX_10BE_c
+//                             || yuv2planeX == yuv2planeX_10LE_c
+//                             || yuv2planeX == yuv2planeX_9BE_c
+//                             || yuv2planeX == yuv2planeX_9LE_c
+//                             || yuv2planeX == yuv2planeX_16BE_c
+//                             || yuv2planeX == yuv2planeX_16LE_c
+//                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
+
+                if(use_mmx_vfilter){
+                    vLumFilter= c->lumMmxFilter;
+                    vChrFilter= c->chrMmxFilter;
+                }
+
                 if (vLumFilterSize == 1) {
                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
                 } else {
-                    yuv2planeX(vLumFilter + dstY * vLumFilterSize,
-                               vLumFilterSize, lumSrcPtr, dest[0],
+                    yuv2planeX(vLumFilter, vLumFilterSize,
+                               lumSrcPtr, dest[0],
                                dstW, c->lumDither8, 0);
                 }
 
                 if (!((dstY & chrSkipMask) || isGray(dstFormat))) {
                     if (yuv2nv12cX) {
-                        yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2nv12cX(c, vChrFilter,
                                    vChrFilterSize, chrUSrcPtr, chrVSrcPtr,
                                    dest[1], chrDstW);
                     } else if (vChrFilterSize == 1) {
                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
                     } else {
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2planeX(vChrFilter,
                                    vChrFilterSize, chrUSrcPtr, dest[1],
                                    chrDstW, c->chrDither8, 0);
-                        yuv2planeX(vChrFilter + chrDstY * vChrFilterSize,
+                        yuv2planeX(vChrFilter,
                                    vChrFilterSize, chrVSrcPtr, dest[2],
-                                   chrDstW, c->chrDither8, 3);
+                                   chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
                     }
                 }
 
                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                    if(use_mmx_vfilter){
+                        vLumFilter= c->alpMmxFilter;
+                    }
                     if (vLumFilterSize == 1) {
                         yuv2plane1(alpSrcPtr[0], dest[3], dstW,
                                    c->lumDither8, 0);
                     } else {
-                        yuv2planeX(vLumFilter + dstY * vLumFilterSize,
+                        yuv2planeX(vLumFilter,
                                    vLumFilterSize, alpSrcPtr, dest[3],
                                    dstW, c->lumDither8, 0);
                     }
@@ -687,6 +689,7 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
 
     ff_sws_init_input_funcs(c);
 
+
     if (c->srcBpc == 8) {
         if (c->dstBpc <= 10) {
             c->hyScale = c->hcScale = hScale8To15_c;
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index 30c87be760..2b3fc5ccd5 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 2a7d2dbd1d..ae53b88e7f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -34,10 +34,14 @@
 
 #define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
+#define YUVRGB_TABLE_HEADROOM 128
+
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
 
 #define MAX_FILTER_SIZE 256
 
+#define DITHER1XBPP
+
 #if HAVE_BIGENDIAN
 #define ALT32_CORR (-1)
 #else
@@ -315,10 +319,10 @@ typedef struct SwsContext {
     int dstY;                     ///< Last destination vertical line output from last slice.
     int flags;                    ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
     void *yuvTable;             // pointer to the yuv->rgb table start so it can be freed()
-    uint8_t *table_rV[256];
-    uint8_t *table_gU[256];
-    int table_gV[256];
-    uint8_t *table_bU[256];
+    uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+    int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+    uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
 
     //Colorspace stuff
     int contrast, brightness, saturation;    // for sws_getColorspaceDetails
@@ -326,6 +330,8 @@ typedef struct SwsContext {
     int dstColorspaceTable[4];
     int srcRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (source      image).
     int dstRange;                 ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+    int src0Alpha;
+    int dst0Alpha;
     int yuv2rgb_y_offset;
     int yuv2rgb_y_coeff;
     int yuv2rgb_v2r_coeff;
@@ -382,8 +388,8 @@ typedef struct SwsContext {
     // alignment of these values is not necessary, but merely here
     // to maintain the same offset across x8632 and x86-64. Once we
     // use proper offset macros in the asm, they can be removed.
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_px);   ///< offset (in pixels) between u and v planes
-    DECLARE_ALIGNED(8, ptrdiff_t, uv_off_byte); ///< offset (in bytes) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+    DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
     DECLARE_ALIGNED(8, uint16_t, dither16)[8];
     DECLARE_ALIGNED(8, uint32_t, dither32)[8];
 
@@ -417,6 +423,7 @@ typedef struct SwsContext {
 #if HAVE_VIS
     DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
 #endif
+    int use_mmx_vfilter;
 
     /* function pointers for swScale() */
     yuv2planar1_fn yuv2plane1;
@@ -427,14 +434,14 @@ typedef struct SwsContext {
     yuv2packedX_fn yuv2packedX;
 
     /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
-    void (*lumToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
-    void (*alpToYV12)(uint8_t *dst, const uint8_t *src,
+    void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
     /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
     void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
-                      const uint8_t *src1, const uint8_t *src2,
+                      const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                       int width, uint32_t *pal);
 
     /**
@@ -539,7 +546,13 @@ SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
 SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
 void ff_bfin_get_unscaled_swscale(SwsContext *c);
 
+#if FF_API_SWS_FORMAT_NAME
+/**
+ * @deprecated Use av_get_pix_fmt_name() instead.
+ */
+attribute_deprecated
 const char *sws_format_name(enum PixelFormat format);
+#endif
 
 #define is16BPS(x) \
     (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 15)
@@ -548,6 +561,8 @@ const char *sws_format_name(enum PixelFormat format);
     (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 8 || \
      av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 9)
 
+#define isNBPS(x) is9_OR_10BPS(x)
+
 #define isBE(x) \
     (av_pix_fmt_descriptors[x].flags & PIX_FMT_BE)
 
@@ -561,7 +576,6 @@ const char *sws_format_name(enum PixelFormat format);
 
 #define isRGB(x) \
     (av_pix_fmt_descriptors[x].flags & PIX_FMT_RGB)
-
 #if 0 // FIXME
 #define isGray(x) \
     (!(av_pix_fmt_descriptors[x].flags & PIX_FMT_PAL) && \
@@ -574,55 +588,95 @@ const char *sws_format_name(enum PixelFormat format);
      (x) == PIX_FMT_GRAY16LE)
 #endif
 
-#define isRGBinInt(x)                  \
-    ((x) == PIX_FMT_RGB48BE     ||     \
-     (x) == PIX_FMT_RGB48LE     ||     \
-     (x) == PIX_FMT_RGB32       ||     \
-     (x) == PIX_FMT_RGB32_1     ||     \
-     (x) == PIX_FMT_RGB24       ||     \
-     (x) == PIX_FMT_RGB565BE    ||     \
-     (x) == PIX_FMT_RGB565LE    ||     \
-     (x) == PIX_FMT_RGB555BE    ||     \
-     (x) == PIX_FMT_RGB555LE    ||     \
-     (x) == PIX_FMT_RGB444BE    ||     \
-     (x) == PIX_FMT_RGB444LE    ||     \
-     (x) == PIX_FMT_RGB8        ||     \
-     (x) == PIX_FMT_RGB4        ||     \
-     (x) == PIX_FMT_RGB4_BYTE   ||     \
-     (x) == PIX_FMT_MONOBLACK   ||     \
-     (x) == PIX_FMT_MONOWHITE)
-
-#define isBGRinInt(x)                  \
-    ((x) == PIX_FMT_BGR48BE     ||     \
-     (x) == PIX_FMT_BGR48LE     ||     \
-     (x) == PIX_FMT_BGR32       ||     \
-     (x) == PIX_FMT_BGR32_1     ||     \
-     (x) == PIX_FMT_BGR24       ||     \
-     (x) == PIX_FMT_BGR565BE    ||     \
-     (x) == PIX_FMT_BGR565LE    ||     \
-     (x) == PIX_FMT_BGR555BE    ||     \
-     (x) == PIX_FMT_BGR555LE    ||     \
-     (x) == PIX_FMT_BGR444BE    ||     \
-     (x) == PIX_FMT_BGR444LE    ||     \
-     (x) == PIX_FMT_BGR8        ||     \
-     (x) == PIX_FMT_BGR4        ||     \
-     (x) == PIX_FMT_BGR4_BYTE   ||     \
-     (x) == PIX_FMT_MONOBLACK   ||     \
-     (x) == PIX_FMT_MONOWHITE)
-
-#define isAnyRGB(x)                    \
-    (isRGBinInt(x)              ||     \
-     isBGRinInt(x))
+#define isRGBinInt(x) \
+    (           \
+        (x)==PIX_FMT_RGB48BE   ||   \
+        (x)==PIX_FMT_RGB48LE   ||   \
+        (x)==PIX_FMT_RGBA64BE  ||   \
+        (x)==PIX_FMT_RGBA64LE  ||   \
+        (x)==PIX_FMT_RGB32     ||   \
+        (x)==PIX_FMT_RGB32_1   ||   \
+        (x)==PIX_FMT_RGB24     ||   \
+        (x)==PIX_FMT_RGB565BE  ||   \
+        (x)==PIX_FMT_RGB565LE  ||   \
+        (x)==PIX_FMT_RGB555BE  ||   \
+        (x)==PIX_FMT_RGB555LE  ||   \
+        (x)==PIX_FMT_RGB444BE  ||   \
+        (x)==PIX_FMT_RGB444LE  ||   \
+        (x)==PIX_FMT_RGB8      ||   \
+        (x)==PIX_FMT_RGB4      ||   \
+        (x)==PIX_FMT_RGB4_BYTE ||   \
+        (x)==PIX_FMT_MONOBLACK ||   \
+        (x)==PIX_FMT_MONOWHITE   \
+    )
+#define isBGRinInt(x) \
+    (           \
+         (x)==PIX_FMT_BGR48BE  ||   \
+         (x)==PIX_FMT_BGR48LE  ||   \
+         (x)==PIX_FMT_BGRA64BE ||   \
+         (x)==PIX_FMT_BGRA64LE ||   \
+         (x)==PIX_FMT_BGR32    ||   \
+         (x)==PIX_FMT_BGR32_1  ||   \
+         (x)==PIX_FMT_BGR24    ||   \
+         (x)==PIX_FMT_BGR565BE ||   \
+         (x)==PIX_FMT_BGR565LE ||   \
+         (x)==PIX_FMT_BGR555BE ||   \
+         (x)==PIX_FMT_BGR555LE ||   \
+         (x)==PIX_FMT_BGR444BE ||   \
+         (x)==PIX_FMT_BGR444LE ||   \
+         (x)==PIX_FMT_BGR8     ||   \
+         (x)==PIX_FMT_BGR4     ||   \
+         (x)==PIX_FMT_BGR4_BYTE||   \
+         (x)==PIX_FMT_MONOBLACK||   \
+         (x)==PIX_FMT_MONOWHITE   \
+    )
+
+#define isRGBinBytes(x) (           \
+           (x)==PIX_FMT_RGB48BE     \
+        || (x)==PIX_FMT_RGB48LE     \
+        || (x)==PIX_FMT_RGBA64BE    \
+        || (x)==PIX_FMT_RGBA64LE    \
+        || (x)==PIX_FMT_RGBA        \
+        || (x)==PIX_FMT_ARGB        \
+        || (x)==PIX_FMT_RGB24       \
+    )
+#define isBGRinBytes(x) (           \
+           (x)==PIX_FMT_BGR48BE     \
+        || (x)==PIX_FMT_BGR48LE     \
+        || (x)==PIX_FMT_BGRA64BE    \
+        || (x)==PIX_FMT_BGRA64LE    \
+        || (x)==PIX_FMT_BGRA        \
+        || (x)==PIX_FMT_ABGR        \
+        || (x)==PIX_FMT_BGR24       \
+    )
+
+#define isAnyRGB(x) \
+    (           \
+          isRGBinInt(x)       ||    \
+          isBGRinInt(x)       ||    \
+          (x)==PIX_FMT_GBR24P     \
+    )
 
 #define isALPHA(x)                                             \
     (av_pix_fmt_descriptors[x].nb_components == 2          ||  \
      av_pix_fmt_descriptors[x].nb_components == 4)
 
+#if 1
+#define isPacked(x)         (       \
+           (x)==PIX_FMT_PAL8        \
+        || (x)==PIX_FMT_YUYV422     \
+        || (x)==PIX_FMT_UYVY422     \
+        || (x)==PIX_FMT_Y400A       \
+        ||  isRGBinInt(x)           \
+        ||  isBGRinInt(x)           \
+    )
+#else
 #define isPacked(x)                                            \
     ((av_pix_fmt_descriptors[x].nb_components >= 2         &&  \
       !(av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR)) ||  \
      (x) == PIX_FMT_PAL8)
 
+#endif
 #define isPlanar(x)                                            \
     (av_pix_fmt_descriptors[x].nb_components >= 2          &&  \
      (av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR))
@@ -641,6 +695,9 @@ const char *sws_format_name(enum PixelFormat format);
 
 extern const uint64_t ff_dither4[2];
 extern const uint64_t ff_dither8[2];
+extern const uint8_t dithers[8][8][8];
+extern const uint16_t dither_scale[15][16];
+
 
 extern const AVClass sws_context_class;
 
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 595edf8ae1..307f4da8c9 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -23,7 +23,6 @@
 #include <math.h>
 #include <stdio.h>
 #include "config.h"
-#include <assert.h>
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "rgb2rgb.h"
@@ -33,48 +32,7 @@
 #include "libavutil/mathematics.h"
 #include "libavutil/bswap.h"
 #include "libavutil/pixdesc.h"
-
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_1)[8][8] = {
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-    {   0,  1,  0,  1,  0,  1,  0,  1,},
-    {   1,  0,  1,  0,  1,  0,  1,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_3)[8][8] = {
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-    {   1,  2,  1,  2,  1,  2,  1,  2,},
-    {   3,  0,  3,  0,  3,  0,  3,  0,},
-};
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_64)[8][8] = {
-    {  18, 34, 30, 46, 17, 33, 29, 45,},
-    {  50,  2, 62, 14, 49,  1, 61, 13,},
-    {  26, 42, 22, 38, 25, 41, 21, 37,},
-    {  58, 10, 54,  6, 57,  9, 53,  5,},
-    {  16, 32, 28, 44, 19, 35, 31, 47,},
-    {  48,  0, 60, 12, 51,  3, 63, 15,},
-    {  24, 40, 20, 36, 27, 43, 23, 39,},
-    {  56,  8, 52,  4, 59, 11, 55,  7,},
-};
-extern const uint8_t dither_8x8_128[8][8];
-DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = {
-    {  72, 136, 120, 184,  68, 132, 116, 180,},
-    { 200,   8, 248,  56, 196,   4, 244,  52,},
-    { 104, 168,  88, 152, 100, 164,  84, 148,},
-    { 232,  40, 216,  24, 228,  36, 212,  20,},
-    {  64, 128, 102, 176,  76, 140, 124, 188,},
-    { 192,   0, 240,  48, 204,  12, 252,  60,},
-    {  96, 160,  80, 144, 108, 172,  92, 156,},
-    { 224,  32, 208,  16, 236,  44, 220,  28,},
-};
+#include "libavutil/avassert.h"
 
 #define RGB2YUV_SHIFT 15
 #define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
@@ -87,6 +45,102 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_256)[8][8] = {
 #define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 #define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
 
+DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
+{
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+  {   0,  1,  0,  1,  0,  1,  0,  1,},
+  {   1,  0,  1,  0,  1,  0,  1,  0,},
+},{
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+  {   1,  2,  1,  2,  1,  2,  1,  2,},
+  {   3,  0,  3,  0,  3,  0,  3,  0,},
+},{
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+  {   2,  4,  3,  5,  2,  4,  3,  5,},
+  {   6,  0,  7,  1,  6,  0,  7,  1,},
+  {   3,  5,  2,  4,  3,  5,  2,  4,},
+  {   7,  1,  6,  0,  7,  1,  6,  0,},
+},{
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+  {   4,  8,  7, 11,  4,  8,  7, 11,},
+  {  12,  0, 15,  3, 12,  0, 15,  3,},
+  {   6, 10,  5,  9,  6, 10,  5,  9,},
+  {  14,  2, 13,  1, 14,  2, 13,  1,},
+},{
+  {   9, 17, 15, 23,  8, 16, 14, 22,},
+  {  25,  1, 31,  7, 24,  0, 30,  6,},
+  {  13, 21, 11, 19, 12, 20, 10, 18,},
+  {  29,  5, 27,  3, 28,  4, 26,  2,},
+  {   8, 16, 14, 22,  9, 17, 15, 23,},
+  {  24,  0, 30,  6, 25,  1, 31,  7,},
+  {  12, 20, 10, 18, 13, 21, 11, 19,},
+  {  28,  4, 26,  2, 29,  5, 27,  3,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  18, 34, 30, 46, 17, 33, 29, 45,},
+  {  50,  2, 62, 14, 49,  1, 61, 13,},
+  {  26, 42, 22, 38, 25, 41, 21, 37,},
+  {  58, 10, 54,  6, 57,  9, 53,  5,},
+  {  16, 32, 28, 44, 19, 35, 31, 47,},
+  {  48,  0, 60, 12, 51,  3, 63, 15,},
+  {  24, 40, 20, 36, 27, 43, 23, 39,},
+  {  56,  8, 52,  4, 59, 11, 55,  7,},
+},{
+  {  36, 68, 60, 92, 34, 66, 58, 90,},
+  { 100,  4,124, 28, 98,  2,122, 26,},
+  {  52, 84, 44, 76, 50, 82, 42, 74,},
+  { 116, 20,108, 12,114, 18,106, 10,},
+  {  32, 64, 56, 88, 38, 70, 62, 94,},
+  {  96,  0,120, 24,102,  6,126, 30,},
+  {  48, 80, 40, 72, 54, 86, 46, 78,},
+  { 112, 16,104,  8,118, 22,110, 14,},
+}};
+
+static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
+
+const uint16_t dither_scale[15][16]={
+{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
+{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
+{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
+{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
+{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
+{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
+{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
+{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
+{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
+{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
+{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
+{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
+{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
+{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
+{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
+};
+
+
 static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
                       uint8_t val)
 {
@@ -98,6 +152,20 @@ static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
     }
 }
 
+static void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
+                      int alpha, int bits)
+{
+    int i, j;
+    uint8_t *ptr = plane + stride * y;
+    int v = alpha ? -1 : (1<<bits);
+    for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j++) {
+            AV_WN16(ptr+2*j, v);
+        }
+        ptr += stride;
+    }
+}
+
 static void copyPlane(const uint8_t *src, int srcStride,
                       int srcSliceY, int srcSliceH, int width,
                       uint8_t *dst, int dstStride)
@@ -310,7 +378,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
     uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
     const uint8_t *srcPtr = src[0];
 
-    if (srcFormat == PIX_FMT_Y400A) {
+    if (srcFormat == PIX_FMT_GRAY8A) {
         switch (dstFormat) {
         case PIX_FMT_RGB32  : conv = gray8aToPacked32; break;
         case PIX_FMT_BGR32  : conv = gray8aToPacked32; break;
@@ -332,7 +400,7 @@ static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 
     if (!conv)
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     else {
         for (i = 0; i < srcSliceH; i++) {
             conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
@@ -453,6 +521,13 @@ static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t *src[],
         || (x) == PIX_FMT_ABGR   \
         )
 
+#define isRGB48(x) (                \
+           (x) == PIX_FMT_RGB48LE   \
+        || (x) == PIX_FMT_RGB48BE   \
+        || (x) == PIX_FMT_BGR48LE   \
+        || (x) == PIX_FMT_BGR48BE   \
+        )
+
 /* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
 typedef void (* rgbConvFn) (const uint8_t *, uint8_t *, int);
 static rgbConvFn findRgbConvFn(SwsContext *c)
@@ -486,6 +561,15 @@ static rgbConvFn findRgbConvFn(SwsContext *c)
               || CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103;
         else if (CONV_IS(BGRA, ABGR)
               || CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012;
+    } else if (isRGB48(srcFormat) && isRGB48(dstFormat)) {
+        if      (CONV_IS(RGB48LE, BGR48LE)
+              || CONV_IS(BGR48LE, RGB48LE)
+              || CONV_IS(RGB48BE, BGR48BE)
+              || CONV_IS(BGR48BE, RGB48BE)) conv = rgb48tobgr48_LL;
+        else if (CONV_IS(RGB48LE, BGR48BE)
+              || CONV_IS(BGR48LE, RGB48BE)
+              || CONV_IS(RGB48BE, BGR48LE)
+              || CONV_IS(BGR48BE, RGB48LE)) conv = rgb48tobgr48_LB;
     } else
     /* BGR -> BGR */
     if ((isBGRinInt(srcFormat) && isBGRinInt(dstFormat)) ||
@@ -544,7 +628,7 @@ static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
 
     if (!conv) {
         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
-               sws_format_name(srcFormat), sws_format_name(dstFormat));
+               av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
     } else {
         const uint8_t *srcPtr = src[0];
               uint8_t *dstPtr = dst[0];
@@ -623,7 +707,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
         while (length + c->srcW <= FFABS(dstStride[0]) &&
                length + c->srcW <= FFABS(srcStride[0]))
             length += c->srcW;
-        assert(length != 0);
+        av_assert1(length != 0);
 
         for (i = 0; i < srcSliceH; i++) {
             memcpy(dstPtr, srcPtr, length);
@@ -634,25 +718,25 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
     return srcSliceH;
 }
 
-#define clip9(x)  av_clip_uintp2(x,  9)
-#define clip10(x) av_clip_uintp2(x, 10)
-#define DITHER_COPY(dst, dstStride, wfunc, src, srcStride, rfunc, dithers, shift, clip) \
-    for (i = 0; i < height; i++) { \
-        const uint8_t *dither = dithers[i & 7]; \
-        for (j = 0; j < length - 7; j += 8) { \
-            wfunc(&dst[j + 0], clip((rfunc(&src[j + 0]) + dither[0]) >> shift)); \
-            wfunc(&dst[j + 1], clip((rfunc(&src[j + 1]) + dither[1]) >> shift)); \
-            wfunc(&dst[j + 2], clip((rfunc(&src[j + 2]) + dither[2]) >> shift)); \
-            wfunc(&dst[j + 3], clip((rfunc(&src[j + 3]) + dither[3]) >> shift)); \
-            wfunc(&dst[j + 4], clip((rfunc(&src[j + 4]) + dither[4]) >> shift)); \
-            wfunc(&dst[j + 5], clip((rfunc(&src[j + 5]) + dither[5]) >> shift)); \
-            wfunc(&dst[j + 6], clip((rfunc(&src[j + 6]) + dither[6]) >> shift)); \
-            wfunc(&dst[j + 7], clip((rfunc(&src[j + 7]) + dither[7]) >> shift)); \
-        } \
-        for (; j < length; j++) \
-            wfunc(&dst[j],     (rfunc(&src[j]) + dither[j & 7]) >> shift); \
-        dst += dstStride; \
-        src += srcStride; \
+#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
+    uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
+    int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
+    for (i = 0; i < height; i++) {\
+        const uint8_t *dither= dithers[src_depth-9][i&7];\
+        for (j = 0; j < length-7; j+=8){\
+            dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
+            dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
+            dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
+            dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
+            dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
+            dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
+            dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
+            dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
+        }\
+        for (; j < length; j++)\
+            dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
+        dst += dstStride;\
+        src += srcStride;\
     }
 
 static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
@@ -666,162 +750,99 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
         int height = (plane == 0 || plane == 3) ? srcSliceH: -((-srcSliceH) >> c->chrDstVSubSample);
         const uint8_t *srcPtr = src[plane];
         uint8_t *dstPtr = dst[plane] + dstStride[plane] * y;
+        int shiftonly= plane==1 || plane==2 || (!c->srcRange && plane==0);
 
         if (!dst[plane])
             continue;
         // ignore palette for GRAY8
         if (plane == 1 && !dst[2]) continue;
         if (!src[plane] || (plane == 1 && !src[2])) {
-            if (is16BPS(c->dstFormat))
-                length *= 2;
-            fillPlane(dst[plane], dstStride[plane], length, height, y,
-                      (plane == 3) ? 255 : 128);
+            if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) {
+                fillPlane16(dst[plane], dstStride[plane], length, height, y,
+                        plane == 3, av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1);
+            } else {
+                fillPlane(dst[plane], dstStride[plane], length, height, y,
+                        (plane == 3) ? 255 : 128);
+            }
         } else {
-            if (is9_OR_10BPS(c->srcFormat)) {
+            if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat)
+               || (is16BPS(c->srcFormat) != is16BPS(c->dstFormat))
+            ) {
                 const int src_depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1 + 1;
                 const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1 + 1;
                 const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
 
-                if (is16BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO16(rfunc, wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << (16 - src_depth)) | (srcpx >> (2 * src_depth - 16))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
+                if (dst_depth == 8) {
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , )
+                    } else {
+                        DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, )
                     }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                } else if (src_depth == 8) {
+                    for (i = 0; i < height; i++) {
+                        #define COPY816(w)\
+                        if(shiftonly){\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\
+                        }else{\
+                            for (j = 0; j < length; j++)\
+                                w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\
+                                               (srcPtr[j]>>(2*8-dst_depth)));\
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        if(isBE(c->dstFormat)){
+                            COPY816(AV_WB16)
                         } else {
-                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                            COPY816(AV_WL16)
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr  += srcStride[plane];
                     }
-                } else if (is9_OR_10BPS(c->dstFormat)) {
-                    uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-#define COPY9_OR_10TO9_OR_10(loop) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            loop; \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr2 += srcStride[plane] / 2; \
-                    }
-#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \
-                    if (dst_depth > src_depth) { \
-                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
-                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \
-                    } else if (dst_depth < src_depth) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, clip9); \
-                    } else { \
-                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
-                        } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
-                        }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+                } else if (src_depth <= dst_depth) {
+                    for (i = 0; i < height; i++) {
+#define COPY_UP(r,w) \
+    if(shiftonly){\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], v<<(dst_depth-src_depth));\
+        }\
+    }else{\
+        for (j = 0; j < length; j++){ \
+            unsigned int v= r(&srcPtr2[j]);\
+            w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \
+                        (v>>(2*src_depth-dst_depth)));\
+        }\
+    }
+                        if(isBE(c->srcFormat)){
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RB16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RB16, AV_WL16)
+                            }
                         } else {
-                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                            if(isBE(c->dstFormat)){
+                                COPY_UP(AV_RL16, AV_WB16)
+                            } else {
+                                COPY_UP(AV_RL16, AV_WL16)
+                            }
                         }
+                        dstPtr2 += dstStride[plane]/2;
+                        srcPtr2 += srcStride[plane]/2;
                     }
                 } else {
-#define W8(a, b) { *(a) = (b); }
-#define COPY9_OR_10TO8(rfunc) \
-                    if (src_depth == 9) { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_1, 1, av_clip_uint8); \
-                    } else { \
-                        DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_3, 2, av_clip_uint8); \
-                    }
-                    if (isBE(c->srcFormat)) {
-                        COPY9_OR_10TO8(AV_RB16);
-                    } else {
-                        COPY9_OR_10TO8(AV_RL16);
-                    }
-                }
-            } else if (is9_OR_10BPS(c->dstFormat)) {
-                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1 + 1;
-                uint16_t *dstPtr2 = (uint16_t *) dstPtr;
-
-                if (is16BPS(c->srcFormat)) {
-                    const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO9_OR_10(rfunc, wfunc) \
-                    if (dst_depth == 9) { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_128, 7, clip9); \
-                    } else { \
-                        DITHER_COPY(dstPtr2, dstStride[plane] / 2, wfunc, \
-                                    srcPtr2, srcStride[plane] / 2, rfunc, \
-                                    dither_8x8_64, 6, clip10); \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                    if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16)
                         }
-                    } else {
-                        if (isBE(c->srcFormat)) {
-                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                    }else{
+                        if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, )
                         } else {
-                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                            DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16)
                         }
                     }
-                } else /* 8bit */ {
-#define COPY8TO9_OR_10(wfunc) \
-                    for (i = 0; i < height; i++) { \
-                        for (j = 0; j < length; j++) { \
-                            const int srcpx = srcPtr[j]; \
-                            wfunc(&dstPtr2[j], (srcpx << (dst_depth - 8)) | (srcpx >> (16 - dst_depth))); \
-                        } \
-                        dstPtr2 += dstStride[plane] / 2; \
-                        srcPtr  += srcStride[plane]; \
-                    }
-                    if (isBE(c->dstFormat)) {
-                        COPY8TO9_OR_10(AV_WB16);
-                    } else {
-                        COPY8TO9_OR_10(AV_WL16);
-                    }
-                }
-            } else if (is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
-                const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
-#define COPY16TO8(rfunc) \
-                    DITHER_COPY(dstPtr,  dstStride[plane],   W8, \
-                                srcPtr2, srcStride[plane] / 2, rfunc, \
-                                dither_8x8_256, 8, av_clip_uint8);
-                if (isBE(c->srcFormat)) {
-                    COPY16TO8(AV_RB16);
-                } else {
-                    COPY16TO8(AV_RL16);
-                }
-            } else if (!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) {
-                for (i = 0; i < height; i++) {
-                    for (j = 0; j < length; j++) {
-                        dstPtr[ j << 1     ] = srcPtr[j];
-                        dstPtr[(j << 1) + 1] = srcPtr[j];
-                    }
-                    srcPtr += srcStride[plane];
-                    dstPtr += dstStride[plane];
                 }
             } else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) &&
                       isBE(c->srcFormat) != isBE(c->dstFormat)) {
@@ -899,28 +920,32 @@ void ff_get_unscaled_swscale(SwsContext *c)
         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
         c->swScale= rgbToRgbWrapper;
 
-    if (isPlanarRGB(srcFormat) && isPackedRGB(dstFormat))
+#define isByteRGB(f) (\
+        f == PIX_FMT_RGB32   ||\
+        f == PIX_FMT_RGB32_1 ||\
+        f == PIX_FMT_RGB24   ||\
+        f == PIX_FMT_BGR32   ||\
+        f == PIX_FMT_BGR32_1 ||\
+        f == PIX_FMT_BGR24)
+
+    if (isAnyRGB(srcFormat) && isPlanar(srcFormat) && isByteRGB(dstFormat))
         c->swScale = planarRgbToRgbWrapper;
 
     /* bswap 16 bits per pixel/component packed formats */
     if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR444) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR48)  ||
+        IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGRA64) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR555) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_BGR565) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_GRAY16) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGB444) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGB48)  ||
+        IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGBA64) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGB555) ||
         IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, PIX_FMT_RGB565))
         c->swScale = packed_16bpc_bswap;
 
-    if ((usePal(srcFormat) && (
-        dstFormat == PIX_FMT_RGB32   ||
-        dstFormat == PIX_FMT_RGB32_1 ||
-        dstFormat == PIX_FMT_RGB24   ||
-        dstFormat == PIX_FMT_BGR32   ||
-        dstFormat == PIX_FMT_BGR32_1 ||
-        dstFormat == PIX_FMT_BGR24)))
+    if (usePal(srcFormat) && isByteRGB(dstFormat))
         c->swScale = palToRgbWrapper;
 
     if (srcFormat == PIX_FMT_YUV422P) {
@@ -951,13 +976,14 @@ void ff_get_unscaled_swscale(SwsContext *c)
     if (srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
         c->swScale = uyvyToYuv422Wrapper;
 
+#define isPlanarGray(x) (isGray(x) && (x) != PIX_FMT_GRAY8A)
     /* simple copy */
     if ( srcFormat == dstFormat ||
         (srcFormat == PIX_FMT_YUVA420P && dstFormat == PIX_FMT_YUV420P) ||
         (srcFormat == PIX_FMT_YUV420P && dstFormat == PIX_FMT_YUVA420P) ||
-        (isPlanarYUV(srcFormat) && isGray(dstFormat)) ||
-        (isPlanarYUV(dstFormat) && isGray(srcFormat)) ||
-        (isGray(dstFormat) && isGray(srcFormat)) ||
+        (isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) ||
+        (isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) ||
+        (isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) ||
         (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) &&
          c->chrDstHSubSample == c->chrSrcHSubSample &&
          c->chrDstVSubSample == c->chrSrcVSubSample &&
@@ -988,7 +1014,7 @@ static void reset_ptr(const uint8_t *src[], int format)
     }
 }
 
-static int check_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt,
+static int check_image_pointers(const uint8_t * const data[4], enum PixelFormat pix_fmt,
                                 const int linesizes[4])
 {
     const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt];
@@ -1013,9 +1039,10 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                                   int srcSliceH, uint8_t *const dst[],
                                   const int dstStride[])
 {
-    int i;
+    int i, ret;
     const uint8_t *src2[4] = { srcSlice[0], srcSlice[1], srcSlice[2], srcSlice[3] };
     uint8_t *dst2[4] = { dst[0], dst[1], dst[2], dst[3] };
+    uint8_t *rgb0_tmp = NULL;
 
     // do not mess up sliceDir if we have a "trailing" 0-size slice
     if (srcSliceH == 0)
@@ -1025,7 +1052,7 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
         return 0;
     }
-    if (!check_image_pointers(dst, c->dstFormat, dstStride)) {
+    if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
         av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
         return 0;
     }
@@ -1040,9 +1067,10 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
 
     if (usePal(c->srcFormat)) {
         for (i = 0; i < 256; i++) {
-            int p, r, g, b, y, u, v;
+            int p, r, g, b, y, u, v, a = 0xff;
             if (c->srcFormat == PIX_FMT_PAL8) {
                 p = ((const uint32_t *)(srcSlice[1]))[i];
+                a = (p >> 24) & 0xFF;
                 r = (p >> 16) & 0xFF;
                 g = (p >>  8) & 0xFF;
                 b =  p        & 0xFF;
@@ -1058,11 +1086,10 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                 r = ( i >> 3     ) * 255;
                 g = ((i >> 1) & 3) * 85;
                 b = ( i       & 1) * 255;
-            } else if (c->srcFormat == PIX_FMT_GRAY8 ||
-                      c->srcFormat == PIX_FMT_Y400A) {
+            } else if (c->srcFormat == PIX_FMT_GRAY8 || c->srcFormat == PIX_FMT_GRAY8A) {
                 r = g = b = i;
             } else {
-                assert(c->srcFormat == PIX_FMT_BGR4_BYTE);
+                av_assert1(c->srcFormat == PIX_FMT_BGR4_BYTE);
                 b = ( i >> 3     ) * 255;
                 g = ((i >> 1) & 3) * 85;
                 r = ( i       & 1) * 255;
@@ -1070,37 +1097,51 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
             y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
             u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
             v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
-            c->pal_yuv[i] = y + (u << 8) + (v << 16);
+            c->pal_yuv[i]= y + (u<<8) + (v<<16) + (a<<24);
 
             switch (c->dstFormat) {
             case PIX_FMT_BGR32:
 #if !HAVE_BIGENDIAN
             case PIX_FMT_RGB24:
 #endif
-                c->pal_rgb[i] =  r + (g << 8) + (b << 16);
+                c->pal_rgb[i]=  r + (g<<8) + (b<<16) + (a<<24);
                 break;
             case PIX_FMT_BGR32_1:
 #if HAVE_BIGENDIAN
             case PIX_FMT_BGR24:
 #endif
-                c->pal_rgb[i] = (r + (g << 8) + (b << 16)) << 8;
+                c->pal_rgb[i]= a + (r<<8) + (g<<16) + (b<<24);
                 break;
             case PIX_FMT_RGB32_1:
 #if HAVE_BIGENDIAN
             case PIX_FMT_RGB24:
 #endif
-                c->pal_rgb[i] = (b + (g << 8) + (r << 16)) << 8;
+                c->pal_rgb[i]= a + (b<<8) + (g<<16) + (r<<24);
                 break;
             case PIX_FMT_RGB32:
 #if !HAVE_BIGENDIAN
             case PIX_FMT_BGR24:
 #endif
             default:
-                c->pal_rgb[i] =  b + (g << 8) + (r << 16);
+                c->pal_rgb[i]=  b + (g<<8) + (r<<16) + (a<<24);
             }
         }
     }
 
+    if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
+        uint8_t *base;
+        int x,y;
+        rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
+        base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
+        for (y=0; y<srcSliceH; y++){
+            memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
+            for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
+                base[ srcStride[0]*y + x] = 0xFF;
+            }
+        }
+        src2[0] = base;
+    }
+
     // copy strides, so they can safely be modified
     if (c->sliceDir == 1) {
         // slices go from top to bottom
@@ -1110,13 +1151,13 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
                               dstStride[3] };
 
         reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
 
         /* reset slice direction at end of frame */
         if (srcSliceY + srcSliceH == c->srcH)
             c->sliceDir = 0;
 
-        return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
+        ret = c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
                           dstStride2);
     } else {
         // slices go from bottom to top => we flip the image internally
@@ -1136,15 +1177,18 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
         dst2[3] += ( c->dstH                         - 1) * dstStride[3];
 
         reset_ptr(src2, c->srcFormat);
-        reset_ptr((const uint8_t **) dst2, c->dstFormat);
+        reset_ptr((void*)dst2, c->dstFormat);
 
         /* reset slice direction at end of frame */
         if (!srcSliceY)
             c->sliceDir = 0;
 
-        return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
+        ret = c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
                           srcSliceH, dst2, dstStride2);
     }
+
+    av_free(rgb0_tmp);
+    return ret;
 }
 
 /* Convert the palette to the same packed 32-bit format as the palette */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index d8fee588a0..9508413664 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1,27 +1,27 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "config.h"
 
 #define _SVID_SOURCE // needed for MAP_ANONYMOUS
-#include <assert.h>
+#define _DARWIN_C_SOURCE // needed for MAP_ANON
 #include <inttypes.h>
 #include <math.h>
 #include <stdio.h>
@@ -37,6 +37,7 @@
 #include <windows.h>
 #endif
 
+#include "libavutil/avassert.h"
 #include "libavutil/avutil.h"
 #include "libavutil/bswap.h"
 #include "libavutil/cpu.h"
@@ -51,18 +52,19 @@
 
 unsigned swscale_version(void)
 {
+    av_assert0(LIBSWSCALE_VERSION_MICRO >= 100);
     return LIBSWSCALE_VERSION_INT;
 }
 
 const char *swscale_configuration(void)
 {
-    return LIBAV_CONFIGURATION;
+    return FFMPEG_CONFIGURATION;
 }
 
 const char *swscale_license(void)
 {
 #define LICENSE_PREFIX "libswscale license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
+    return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
 #define RET 0xC3 // near return opcode for x86
@@ -101,13 +103,21 @@ static const FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_RGBA]        = { 1, 1 },
     [PIX_FMT_ABGR]        = { 1, 1 },
     [PIX_FMT_BGRA]        = { 1, 1 },
+    [PIX_FMT_0RGB]        = { 1, 1 },
+    [PIX_FMT_RGB0]        = { 1, 1 },
+    [PIX_FMT_0BGR]        = { 1, 1 },
+    [PIX_FMT_BGR0]        = { 1, 1 },
     [PIX_FMT_GRAY16BE]    = { 1, 1 },
     [PIX_FMT_GRAY16LE]    = { 1, 1 },
     [PIX_FMT_YUV440P]     = { 1, 1 },
     [PIX_FMT_YUVJ440P]    = { 1, 1 },
     [PIX_FMT_YUVA420P]    = { 1, 1 },
+    [PIX_FMT_YUVA422P]    = { 1, 1 },
+    [PIX_FMT_YUVA444P]    = { 1, 1 },
     [PIX_FMT_RGB48BE]     = { 1, 1 },
     [PIX_FMT_RGB48LE]     = { 1, 1 },
+    [PIX_FMT_RGBA64BE]    = { 1, 0 },
+    [PIX_FMT_RGBA64LE]    = { 1, 0 },
     [PIX_FMT_RGB565BE]    = { 1, 1 },
     [PIX_FMT_RGB565LE]    = { 1, 1 },
     [PIX_FMT_RGB555BE]    = { 1, 1 },
@@ -129,6 +139,8 @@ static const FormatEntry format_entries[PIX_FMT_NB] = {
     [PIX_FMT_Y400A]       = { 1, 0 },
     [PIX_FMT_BGR48BE]     = { 1, 1 },
     [PIX_FMT_BGR48LE]     = { 1, 1 },
+    [PIX_FMT_BGRA64BE]    = { 0, 0 },
+    [PIX_FMT_BGRA64LE]    = { 0, 0 },
     [PIX_FMT_YUV420P9BE]  = { 1, 1 },
     [PIX_FMT_YUV420P9LE]  = { 1, 1 },
     [PIX_FMT_YUV420P10BE] = { 1, 1 },
@@ -164,13 +176,12 @@ int sws_isSupportedOutput(enum PixelFormat pix_fmt)
 
 extern const int32_t ff_yuv2rgb_coeffs[8][4];
 
+#if FF_API_SWS_FORMAT_NAME
 const char *sws_format_name(enum PixelFormat format)
 {
-    if ((unsigned)format < PIX_FMT_NB && av_pix_fmt_descriptors[format].name)
-        return av_pix_fmt_descriptors[format].name;
-    else
-        return "Unknown format";
+    return av_get_pix_fmt_name(format);
 }
+#endif
 
 static double getSplineCoeff(double a, double b, double c, double d,
                              double dist)
@@ -189,7 +200,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
                       int *outFilterSize, int xInc, int srcW, int dstW,
                       int filterAlign, int one, int flags, int cpu_flags,
                       SwsVector *srcFilter, SwsVector *dstFilter,
-                      double param[2], int is_horizontal)
+                      double param[2])
 {
     int i;
     int filterSize;
@@ -217,7 +228,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
         }
     } else if (flags & SWS_POINT) { // lame looking point sampling mode
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize = 1;
         FF_ALLOC_OR_GOTO(NULL, filter,
                          dstW * sizeof(*filter) * filterSize, fail);
@@ -233,7 +244,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     } else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
                (flags & SWS_FAST_BILINEAR)) { // bilinear upscale
         int i;
-        int xDstInSrc;
+        int64_t xDstInSrc;
         filterSize = 2;
         FF_ALLOC_OR_GOTO(NULL, filter,
                          dstW * sizeof(*filter) * filterSize, fail);
@@ -246,8 +257,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
             (*filterPos)[i] = xx;
             // bilinear upscale / linear interpolate / area averaging
             for (j = 0; j < filterSize; j++) {
-                int64_t coeff = fone - FFABS((xx << 16) - xDstInSrc) *
-                                (fone >> 16);
+                int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
                 if (coeff < 0)
                     coeff = 0;
                 filter[i * filterSize + j] = coeff;
@@ -276,8 +286,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
         else if (flags & SWS_BILINEAR)
             sizeFactor = 2;
         else {
-            sizeFactor = 0;     // GCC warning killer
-            assert(0);
+            av_assert0(0);
         }
 
         if (xInc <= 1 << 16)
@@ -376,8 +385,7 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
                     double p = -2.196152422706632;
                     coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
                 } else {
-                    coeff = 0.0; // GCC warning killer
-                    assert(0);
+                    av_assert0(0);
                 }
 
                 filter[i * filterSize + j] = coeff;
@@ -390,13 +398,13 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     /* apply src & dst Filter to filter -> filter2
      * av_free(filter);
      */
-    assert(filterSize > 0);
+    av_assert0(filterSize > 0);
     filter2Size = filterSize;
     if (srcFilter)
         filter2Size += srcFilter->length - 1;
     if (dstFilter)
         filter2Size += dstFilter->length - 1;
-    assert(filter2Size > 0);
+    av_assert0(filter2Size > 0);
     FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size * dstW * sizeof(*filter2), fail);
 
     for (i = 0; i < dstW; i++) {
@@ -478,9 +486,9 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
             filterAlign = 1;
     }
 
-    assert(minFilterSize > 0);
+    av_assert0(minFilterSize > 0);
     filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
-    assert(filterSize > 0);
+    av_assert0(filterSize > 0);
     filter = av_malloc(filterSize * dstW * sizeof(*filter));
     if (filterSize >= MAX_FILTER_SIZE * 16 /
                       ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
@@ -508,29 +516,27 @@ static int initFilter(int16_t **outFilter, int32_t **filterPos,
     // FIXME try to align filterPos if possible
 
     // fix borders
-    if (is_horizontal) {
-        for (i = 0; i < dstW; i++) {
-            int j;
-            if ((*filterPos)[i] < 0) {
-                // move filter coefficients left to compensate for filterPos
-                for (j = 1; j < filterSize; j++) {
-                    int left = FFMAX(j + (*filterPos)[i], 0);
-                    filter[i * filterSize + left] += filter[i * filterSize + j];
-                    filter[i * filterSize + j]     = 0;
-                }
-                (*filterPos)[i] = 0;
+    for (i = 0; i < dstW; i++) {
+        int j;
+        if ((*filterPos)[i] < 0) {
+            // move filter coefficients left to compensate for filterPos
+            for (j = 1; j < filterSize; j++) {
+                int left = FFMAX(j + (*filterPos)[i], 0);
+                filter[i * filterSize + left] += filter[i * filterSize + j];
+                filter[i * filterSize + j]     = 0;
             }
+            (*filterPos)[i]= 0;
+        }
 
-            if ((*filterPos)[i] + filterSize > srcW) {
-                int shift = (*filterPos)[i] + filterSize - srcW;
-                // move filter coefficients right to compensate for filterPos
-                for (j = filterSize - 2; j >= 0; j--) {
-                    int right = FFMIN(j + shift, filterSize - 1);
-                    filter[i * filterSize + right] += filter[i * filterSize + j];
-                    filter[i * filterSize + j]      = 0;
-                }
-                (*filterPos)[i] = srcW - filterSize;
+        if ((*filterPos)[i] + filterSize > srcW) {
+            int shift = (*filterPos)[i] + filterSize - srcW;
+            // move filter coefficients right to compensate for filterPos
+            for (j = filterSize - 2; j >= 0; j--) {
+                int right = FFMIN(j + shift, filterSize - 1);
+                filter[i * filterSize + right] += filter[i * filterSize + j];
+                filter[i * filterSize + j]      = 0;
             }
+            (*filterPos)[i]= srcW - filterSize;
         }
     }
 
@@ -779,7 +785,7 @@ int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
                              int *srcRange, int **table, int *dstRange,
                              int *brightness, int *contrast, int *saturation)
 {
-    if (isYUV(c->dstFormat) || isGray(c->dstFormat))
+    if (!c || isYUV(c->dstFormat) || isGray(c->dstFormat))
         return -1;
 
     *inv_table  = c->srcColorspaceTable;
@@ -813,6 +819,17 @@ static int handle_jpeg(enum PixelFormat *format)
     }
 }
 
+static int handle_0alpha(enum PixelFormat *format)
+{
+    switch (*format) {
+    case PIX_FMT_0BGR    : *format = PIX_FMT_ABGR   ; return 1;
+    case PIX_FMT_BGR0    : *format = PIX_FMT_BGRA   ; return 4;
+    case PIX_FMT_0RGB    : *format = PIX_FMT_ARGB   ; return 1;
+    case PIX_FMT_RGB0    : *format = PIX_FMT_RGBA   ; return 4;
+    default:                                          return 0;
+    }
+}
+
 SwsContext *sws_alloc_context(void)
 {
     SwsContext *c = av_mallocz(sizeof(SwsContext));
@@ -825,7 +842,7 @@ SwsContext *sws_alloc_context(void)
 
 int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 {
-    int i;
+    int i, j;
     int usesVFilter, usesHFilter;
     int unscaled;
     SwsFilter dummyFilter = { NULL, NULL, NULL, NULL };
@@ -833,8 +850,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     int srcH              = c->srcH;
     int dstW              = c->dstW;
     int dstH              = c->dstH;
-    int dst_stride        = FFALIGN(dstW * sizeof(int16_t) + 16, 16);
-    int dst_stride_px     = dst_stride >> 1;
+    int dst_stride        = FFALIGN(dstW * sizeof(int16_t) + 66, 16);
     int flags, cpu_flags;
     enum PixelFormat srcFormat = c->srcFormat;
     enum PixelFormat dstFormat = c->dstFormat;
@@ -847,14 +863,25 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     unscaled = (srcW == dstW && srcH == dstH);
 
+    handle_jpeg(&srcFormat);
+    handle_jpeg(&dstFormat);
+    handle_0alpha(&srcFormat);
+    handle_0alpha(&dstFormat);
+
+    if(srcFormat!=c->srcFormat || dstFormat!=c->dstFormat){
+        av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you did set range correctly\n");
+        c->srcFormat= srcFormat;
+        c->dstFormat= dstFormat;
+    }
+
     if (!sws_isSupportedInput(srcFormat)) {
         av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n",
-               sws_format_name(srcFormat));
+               av_get_pix_fmt_name(srcFormat));
         return AVERROR(EINVAL);
     }
     if (!sws_isSupportedOutput(dstFormat)) {
         av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n",
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
         return AVERROR(EINVAL);
     }
 
@@ -870,8 +897,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                  SWS_SPLINE        |
                  SWS_BICUBLIN);
     if (!i || (i & (i - 1))) {
-        av_log(c, AV_LOG_ERROR,
-               "Exactly one scaler algorithm must be chosen\n");
+        av_log(c, AV_LOG_ERROR, "Exactly one scaler algorithm must be chosen, got %X\n", i);
         return AVERROR(EINVAL);
     }
     /* sanity check */
@@ -906,6 +932,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
     getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
 
+
+    if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
+        if (dstW&1) {
+            av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
+            flags |= SWS_FULL_CHR_H_INT;
+            c->flags = flags;
+        }
+    }
     /* reuse chroma for 2 pixels RGB/BGR unless user wants full
      * chroma interpolation */
     if (flags & SWS_FULL_CHR_H_INT &&
@@ -916,9 +950,9 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         dstFormat != PIX_FMT_ABGR  &&
         dstFormat != PIX_FMT_RGB24 &&
         dstFormat != PIX_FMT_BGR24) {
-        av_log(c, AV_LOG_ERROR,
+        av_log(c, AV_LOG_WARNING,
                "full chroma interpolation for destination format '%s' not yet implemented\n",
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
         flags   &= ~SWS_FULL_CHR_H_INT;
         c->flags = flags;
     }
@@ -955,7 +989,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             if (flags & SWS_PRINT_INFO)
                 av_log(c, AV_LOG_INFO,
                        "using unscaled %s -> %s special converter\n",
-                       sws_format_name(srcFormat), sws_format_name(dstFormat));
+                       av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
             return 0;
         }
     }
@@ -966,11 +1000,11 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     c->dstBpc = 1 + av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1;
     if (c->dstBpc < 8)
         c->dstBpc = 8;
+    if (isAnyRGB(srcFormat) || srcFormat == PIX_FMT_PAL8)
+        c->srcBpc = 16;
     if (c->dstBpc == 16)
         dst_stride <<= 1;
-    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
-                     (FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
-                     fail);
+    FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
     if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 &&
         c->srcBpc == 8 && c->dstBpc <= 10) {
         c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
@@ -981,8 +1015,8 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                 av_log(c, AV_LOG_INFO,
                        "output width is not a multiple of 32 -> no MMX2 scaler\n");
         }
-        if (usesHFilter)
-            c->canMMX2BeUsed = 0;
+        if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat))
+            c->canMMX2BeUsed=0;
     } else
         c->canMMX2BeUsed = 0;
 
@@ -1002,7 +1036,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrXInc += 20;
         }
         // we don't use the x86 asm scaler if MMX is available
-        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX && c->dstBpc <= 10) {
             c->lumXInc = ((int64_t)(srcW       - 2) << 16) / (dstW       - 2) - 20;
             c->chrXInc = ((int64_t)(c->chrSrcW - 2) << 16) / (c->chrDstW - 2) - 20;
         }
@@ -1029,17 +1063,25 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrMmx2FilterCode = av_malloc(c->chrMmx2FilterCodeSize);
 #endif
 
+#ifdef MAP_ANONYMOUS
+            if (c->lumMmx2FilterCode == MAP_FAILED || c->chrMmx2FilterCode == MAP_FAILED)
+#else
             if (!c->lumMmx2FilterCode || !c->chrMmx2FilterCode)
+#endif
+            {
+                av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n");
                 return AVERROR(ENOMEM);
+            }
+
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilter,    (dstW           / 8 + 8) * sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilter,    (c->chrDstW     / 4 + 8) * sizeof(int16_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW       / 2 / 8 + 8) * sizeof(int32_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
 
-            initMMX2HScaler(dstW, c->lumXInc, c->lumMmx2FilterCode,
-                            c->hLumFilter, c->hLumFilterPos, 8);
+            initMMX2HScaler(      dstW, c->lumXInc, c->lumMmx2FilterCode,
+                            c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
             initMMX2HScaler(c->chrDstW, c->chrXInc, c->chrMmx2FilterCode,
-                            c->hChrFilter, c->hChrFilterPos, 4);
+                            c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
 
 #ifdef MAP_ANONYMOUS
             mprotect(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
@@ -1058,14 +1100,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                            srcW, dstW, filterAlign, 1 << 14,
                            (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
                            cpu_flags, srcFilter->lumH, dstFilter->lumH,
-                           c->param, 1) < 0)
+                           c->param) < 0)
                 goto fail;
             if (initFilter(&c->hChrFilter, &c->hChrFilterPos,
                            &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
                            (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
                            cpu_flags, srcFilter->chrH, dstFilter->chrH,
-                           c->param, 1) < 0)
+                           c->param) < 0)
                 goto fail;
         }
     } // initialize horizontal stuff
@@ -1081,14 +1123,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                        c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
                        (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
                        cpu_flags, srcFilter->lumV, dstFilter->lumV,
-                       c->param, 0) < 0)
+                       c->param) < 0)
             goto fail;
         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
                        c->chrYInc, c->chrSrcH, c->chrDstH,
                        filterAlign, (1 << 12),
                        (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
                        cpu_flags, srcFilter->chrV, dstFilter->chrV,
-                       c->param, 0) < 0)
+                       c->param) < 0)
             goto fail;
 
 #if HAVE_ALTIVEC
@@ -1145,9 +1187,9 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                           dst_stride + 16, fail);
         c->lumPixBuf[i] = c->lumPixBuf[i + c->vLumBufSize];
     }
-    // 64 / (c->dstBpc & ~7) is the same as 16 / sizeof(scaling_intermediate)
-    c->uv_off_px   = dst_stride_px + 64 / (c->dstBpc & ~7);
-    c->uv_off_byte = dst_stride + 16;
+    // 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
+    c->uv_off   = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
+    c->uv_offx2 = dst_stride + 16;
     for (i = 0; i < c->vChrBufSize; i++) {
         FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i + c->vChrBufSize],
                          dst_stride * 2 + 32, fail);
@@ -1164,9 +1206,15 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     // try to avoid drawing green stuff between the right end and the stride end
     for (i = 0; i < c->vChrBufSize; i++)
-        memset(c->chrUPixBuf[i], 64, dst_stride * 2 + 1);
+        if(av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 == 15){
+            av_assert0(c->dstBpc > 10);
+            for(j=0; j<dst_stride/2+1; j++)
+                ((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
+        } else
+            for(j=0; j<dst_stride+1; j++)
+                ((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
 
-    assert(c->chrDstH <= dstH);
+    av_assert0(c->chrDstH <= dstH);
 
     if (flags & SWS_PRINT_INFO) {
         if (flags & SWS_FAST_BILINEAR)
@@ -1195,7 +1243,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
 
         av_log(c, AV_LOG_INFO, "from %s to %s%s ",
-               sws_format_name(srcFormat),
+               av_get_pix_fmt_name(srcFormat),
 #ifdef DITHER1XBPP
                dstFormat == PIX_FMT_BGR555   || dstFormat == PIX_FMT_BGR565   ||
                dstFormat == PIX_FMT_RGB444BE || dstFormat == PIX_FMT_RGB444LE ||
@@ -1204,7 +1252,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 #else
                "",
 #endif
-               sws_format_name(dstFormat));
+               av_get_pix_fmt_name(dstFormat));
 
         if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2)
             av_log(c, AV_LOG_INFO, "using MMX2\n");
@@ -1251,6 +1299,8 @@ SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat,
     c->dstH      = dstH;
     c->srcRange  = handle_jpeg(&srcFormat);
     c->dstRange  = handle_jpeg(&dstFormat);
+    c->src0Alpha = handle_0alpha(&srcFormat);
+    c->dst0Alpha = handle_0alpha(&dstFormat);
     c->srcFormat = srcFormat;
     c->dstFormat = dstFormat;
 
@@ -1639,7 +1689,7 @@ void sws_freeContext(SwsContext *c)
 #endif /* HAVE_MMX */
 
     av_freep(&c->yuvTable);
-    av_free(c->formatConvBuffer);
+    av_freep(&c->formatConvBuffer);
 
     av_free(c);
 }
@@ -1678,10 +1728,12 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
         context->srcW      = srcW;
         context->srcH      = srcH;
         context->srcRange  = handle_jpeg(&srcFormat);
+        context->src0Alpha = handle_0alpha(&srcFormat);
         context->srcFormat = srcFormat;
         context->dstW      = dstW;
         context->dstH      = dstH;
         context->dstRange  = handle_jpeg(&dstFormat);
+        context->dst0Alpha = handle_0alpha(&dstFormat);
         context->dstFormat = dstFormat;
         context->flags     = flags;
         context->param[0]  = param[0];
diff --git a/libswscale/version.h b/libswscale/version.h
index 32bb2f5887..ef0e0298f6 100644
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -1,18 +1,18 @@
 /*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -28,7 +28,7 @@
 
 #define LIBSWSCALE_VERSION_MAJOR 2
 #define LIBSWSCALE_VERSION_MINOR 1
-#define LIBSWSCALE_VERSION_MICRO 0
+#define LIBSWSCALE_VERSION_MICRO 100
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
                                                LIBSWSCALE_VERSION_MINOR, \
@@ -52,5 +52,8 @@
 #ifndef FF_API_SWS_CPU_CAPS
 #define FF_API_SWS_CPU_CAPS    (LIBSWSCALE_VERSION_MAJOR < 3)
 #endif
+#ifndef FF_API_SWS_FORMAT_NAME
+#define FF_API_SWS_FORMAT_NAME  (LIBSWSCALE_VERSION_MAJOR < 3)
+#endif
 
 #endif /* SWSCALE_VERSION_H */
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 7f3779983d..68d5381701 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,3 +1,5 @@
+$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
+
 OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
 MMX-OBJS                        += x86/rgb2rgb.o                        \
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
index 66d88458bb..c4174ee65e 100644
--- a/libswscale/x86/input.asm
+++ b/libswscale/x86/input.asm
@@ -36,8 +36,8 @@ SECTION_RODATA
 %define GV 0xD0E3
 %define BV 0xF6E4
 
-rgb_Yrnd:        times 4 dd 0x84000        ;  16.5 << 15
-rgb_UVrnd:       times 4 dd 0x404000       ; 128.5 << 15
+rgb_Yrnd:        times 4 dd 0x80100        ;  16.5 << 15
+rgb_UVrnd:       times 4 dd 0x400100       ; 128.5 << 15
 bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
 bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
 rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
@@ -83,7 +83,7 @@ SECTION .text
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_Y_FN 2-3
-cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
+cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, u3
 %if mmsize == 8
     mova           m5, [%2_Ycoeff_12x4]
     mova           m6, [%2_Ycoeff_3x56]
@@ -115,6 +115,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
+    add            wq, wq
     add          dstq, wq
     neg            wq
 %if notcpuflag(ssse3)
@@ -158,12 +159,11 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
     paddd          m2, m3                 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
     paddd          m0, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
     paddd          m2, m4                 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
@@ -172,7 +172,7 @@ cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2 = rgb or bgr
 %macro RGB24_TO_UV_FN 2-3
-cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [%2_Ucoeff_12x4]
     mova           m9, [%2_Ucoeff_3x56]
@@ -203,10 +203,11 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
 %endif ; x86-32/64
 %endif ; cpuflag(ssse3)
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
     neg            wq
@@ -264,23 +265,20 @@ cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
     paddd          m2, m6                 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
     paddd          m1, m6                 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
     paddd          m4, m6                 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
-    psrad          m1, 15
-    psrad          m4, 15
+    psrad          m0, 9
+    psrad          m2, 9
+    psrad          m1, 9
+    psrad          m4, 9
     packssdw       m0, m1                 ; (word) { U[0-7] }
     packssdw       m2, m4                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-3] }
-    packuswb       m2, m2                 ; (byte) { V[0-3] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -306,13 +304,15 @@ RGB24_FUNCS 10, 12
 INIT_XMM ssse3
 RGB24_FUNCS 11, 13
 
+%if HAVE_AVX
 INIT_XMM avx
 RGB24_FUNCS 11, 13
+%endif
 
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_Y_FN 5-6
-cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
+cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, u3
     mova           m5, [rgba_Ycoeff_%2%4]
     mova           m6, [rgba_Ycoeff_%3%5]
 %if %0 == 6
@@ -323,6 +323,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     movsxd         wq, wd
 %endif
     lea          srcq, [srcq+wq*4]
+    add            wq, wq
     add          dstq, wq
     neg            wq
     mova           m4, [rgb_Yrnd]
@@ -330,8 +331,8 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m2, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  3,  2,  7     ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
     pmaddwd        m1, m5                 ; (dword) { Bx*BY + Rx*RY }[0-3]
     pmaddwd        m0, m6                 ; (dword) { Gx*GY }[0-3]
@@ -341,12 +342,11 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
     paddd          m2, m4                 ; += rgb_Yrnd
     paddd          m0, m1                 ; (dword) { Y[0-3] }
     paddd          m2, m3                 ; (dword) { Y[4-7] }
-    psrad          m0, 15
-    psrad          m2, 15
+    psrad          m0, 9
+    psrad          m2, 9
     packssdw       m0, m2                 ; (word) { Y[0-7] }
-    packuswb       m0, m0                 ; (byte) { Y[0-7] }
-    movh    [dstq+wq], m0
-    add            wq, mmsize / 2
+    mova    [dstq+wq], m0
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; %0 == 3
@@ -355,7 +355,7 @@ cglobal %2%3%4%5 %+ ToY, 3, 3, %1, dst, src, w
 ; %1 = nr. of XMM registers
 ; %2-5 = rgba, bgra, argb or abgr (in individual characters)
 %macro RGB32_TO_UV_FN 5-6
-cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, u3
 %if ARCH_X86_64
     mova           m8, [rgba_Ucoeff_%2%4]
     mova           m9, [rgba_Ucoeff_%3%5]
@@ -376,21 +376,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
 %else ; ARCH_X86_64 && %0 == 6
 .body:
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
+    add            wq, wq
     add         dstUq, wq
     add         dstVq, wq
-    lea          srcq, [srcq+wq*4]
+    lea          srcq, [srcq+wq*2]
     neg            wq
     pcmpeqb        m7, m7
     psrlw          m7, 8                  ; (word) { 0x00ff } x4
     mova           m6, [rgb_UVrnd]
 .loop:
     ; FIXME check alignment and use mova
-    movu           m0, [srcq+wq*4+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
-    movu           m4, [srcq+wq*4+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
+    movu           m0, [srcq+wq*2+0]      ; (byte) { Bx, Gx, Rx, xx }[0-3]
+    movu           m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
     DEINTB          1,  0,  5,  4,  7     ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
     pmaddwd        m3, m1, coeffV1        ; (dword) { Bx*BV + Rx*RV }[0-3]
     pmaddwd        m2, m0, coeffV2        ; (dword) { Gx*GV }[0-3]
@@ -406,25 +407,22 @@ cglobal %2%3%4%5 %+ ToUV, 3, 4, %1, dstU, dstV, src, w
     pmaddwd        m4, coeffU2            ; (dword) { Gx*GU }[4-7]
     paddd          m3, m6                 ; += rgb_UVrnd
     paddd          m5, m6                 ; += rgb_UVrnd
-    psrad          m0, 15
+    psrad          m0, 9
     paddd          m1, m3                 ; (dword) { V[4-7] }
     paddd          m4, m5                 ; (dword) { U[4-7] }
-    psrad          m2, 15
-    psrad          m4, 15
-    psrad          m1, 15
+    psrad          m2, 9
+    psrad          m4, 9
+    psrad          m1, 9
     packssdw       m0, m4                 ; (word) { U[0-7] }
     packssdw       m2, m1                 ; (word) { V[0-7] }
 %if mmsize == 8
-    packuswb       m0, m0                 ; (byte) { U[0-7] }
-    packuswb       m2, m2                 ; (byte) { V[0-7] }
-    movh   [dstUq+wq], m0
-    movh   [dstVq+wq], m2
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %else ; mmsize == 16
-    packuswb       m0, m2                 ; (byte) { U[0-7], V[0-7] }
-    movh   [dstUq+wq], m0
-    movhps [dstVq+wq], m0
+    mova   [dstUq+wq], m0
+    mova   [dstVq+wq], m2
 %endif ; mmsize == 8/16
-    add            wq, mmsize / 2
+    add            wq, mmsize
     jl .loop
     REP_RET
 %endif ; ARCH_X86_64 && %0 == 3
@@ -452,8 +450,10 @@ RGB32_FUNCS 0, 0
 INIT_XMM sse2
 RGB32_FUNCS 8, 12
 
+%if HAVE_AVX
 INIT_XMM avx
 RGB32_FUNCS 8, 12
+%endif
 
 ;-----------------------------------------------------------------------------
 ; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
@@ -490,7 +490,7 @@ RGB32_FUNCS 8, 12
 ;      will be the same (i.e. YUYV+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_Y_FN 2-3
-cglobal %2ToY, 3, 3, %1, dst, src, w
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
 %if ARCH_X86_64
     movsxd         wq, wd
 %endif
@@ -560,11 +560,11 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
 ;      will be the same (i.e. UYVY+AVX), and thus we don't need to
 ;      split the loop in an aligned and unaligned case
 %macro YUYV_TO_UV_FN 2-3
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -594,8 +594,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 .loop_%1:
     mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
     mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
-    pand           m2, m0, m4             ; (word) { U0, U1, ..., U7 }
-    pand           m3, m1, m4             ; (word) { U8, U9, ..., U15 }
+    pand           m2, m0, m5             ; (word) { U0, U1, ..., U7 }
+    pand           m3, m1, m5             ; (word) { U8, U9, ..., U15 }
     psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
     psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
     packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
@@ -615,11 +615,11 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
 ; %1 = nr. of XMM registers
 ; %2 = nv12 or nv21
 %macro NVXX_TO_UV_FN 2
-cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
 %if ARCH_X86_64
-    movsxd         wq, dword r4m
+    movsxd         wq, dword r5m
 %else ; x86-32
-    mov            wq, r4m
+    mov            wq, r5m
 %endif
     add         dstUq, wq
     add         dstVq, wq
@@ -627,8 +627,8 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
     test         srcq, 15
 %endif
     lea          srcq, [srcq+wq*2]
-    pcmpeqb        m4, m4                 ; (byte) { 0xff } x 16
-    psrlw          m4, 8                  ; (word) { 0x00ff } x 8
+    pcmpeqb        m5, m5                 ; (byte) { 0xff } x 16
+    psrlw          m5, 8                  ; (word) { 0x00ff } x 8
 %if mmsize == 16
     jnz .loop_u_start
     neg            wq
@@ -660,6 +660,7 @@ YUYV_TO_UV_FN 3, uyvy
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
 
+%if HAVE_AVX
 INIT_XMM avx
 ; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
 ; that's not faster in practice
@@ -667,3 +668,4 @@ YUYV_TO_UV_FN 3, yuyv
 YUYV_TO_UV_FN 3, uyvy, 1
 NVXX_TO_UV_FN 5, nv12
 NVXX_TO_UV_FN 5, nv21
+%endif
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 9b0b01253a..01a946f7b3 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -267,10 +267,12 @@ yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
 yuv2planeX_fn 16,  8, 5
 
+%if HAVE_AVX
 INIT_XMM avx
 yuv2planeX_fn  8, 10, 7
 yuv2planeX_fn  9,  7, 5
 yuv2planeX_fn 10,  7, 5
+%endif
 
 ; %1=outout-bpc, %2=alignment (u/a)
 %macro yuv2plane1_mainloop 2
@@ -405,8 +407,10 @@ yuv2plane1_fn 16, 6, 3
 INIT_XMM sse4
 yuv2plane1_fn 16, 5, 3
 
+%if HAVE_AVX
 INIT_XMM avx
 yuv2plane1_fn  8, 5, 5
 yuv2plane1_fn  9, 5, 3
 yuv2plane1_fn 10, 5, 3
 yuv2plane1_fn 16, 5, 3
+%endif
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 282618c301..9359f0b1f3 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -6,20 +6,20 @@
  * Written by Nick Kurshev.
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,6 +68,9 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
 DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
+DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
 
 #define RGB2YUV_SHIFT 8
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index c255610193..bb97a0b4a1 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -7,20 +7,20 @@
  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  * lot of big-endian byte order fixes by Alex Beregszaszi
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -801,27 +801,6 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_s
     }
 }
 
-/*
-  I use less accurate approximation here by simply left-shifting the input
-  value and filling the low order bits with zeroes. This method improves PNG
-  compression but this scheme cannot reproduce white exactly, since it does
-  not generate an all-ones maximum value; the net effect is to darken the
-  image slightly.
-
-  The better method should be "left bit replication":
-
-   4 3 2 1 0
-   ---------
-   1 1 0 1 1
-
-   7 6 5 4 3  2 1 0
-   ----------------
-   1 1 0 1 1  1 1 0
-   |=======|  |===|
-       |      leftmost bits repeated to fill open bits
-       |
-   original bits
-*/
 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
 {
     const uint16_t *end;
@@ -840,9 +819,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -870,9 +850,10 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -892,7 +873,7 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "por        %%mm5, %%mm3    \n\t"
 
             :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null)
             :"memory");
         /* borrowed 32 to 24 */
         __asm__ volatile(
@@ -919,9 +900,9 @@ static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
     }
 }
 
@@ -943,9 +924,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -973,9 +956,11 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             "movq       %%mm0, %%mm3    \n\t"
             "movq       %%mm1, %%mm4    \n\t"
             "movq       %%mm2, %%mm5    \n\t"
@@ -1021,9 +1006,9 @@ static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int sr
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
     }
 }
 
@@ -1066,12 +1051,13 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $2, %%mm1    \n\t"
-            "psrlq         $7, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        %5, %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             PACK_RGB32
             :"=m"(*d)
-            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
+            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)
             :"memory");
         d += 16;
         s += 4;
@@ -1081,9 +1067,9 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x3E0)>>2;
-        *d++ = (bgr&0x7C00)>>7;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
+        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
         *d++ = 255;
     }
 }
@@ -1108,12 +1094,14 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
             "pand          %2, %%mm0    \n\t"
             "pand          %3, %%mm1    \n\t"
             "pand          %4, %%mm2    \n\t"
-            "psllq         $3, %%mm0    \n\t"
-            "psrlq         $3, %%mm1    \n\t"
-            "psrlq         $8, %%mm2    \n\t"
+            "psllq         $5, %%mm0    \n\t"
+            "psrlq         $1, %%mm2    \n\t"
+            "pmulhw        %5, %%mm0    \n\t"
+            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
+            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
             PACK_RGB32
             :"=m"(*d)
-            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
+            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
             :"memory");
         d += 16;
         s += 4;
@@ -1123,9 +1111,9 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
     while (s < end) {
         register uint16_t bgr;
         bgr = *s++;
-        *d++ = (bgr&0x1F)<<3;
-        *d++ = (bgr&0x7E0)>>3;
-        *d++ = (bgr&0xF800)>>8;
+        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
+        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
+        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
         *d++ = 255;
     }
 }
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index d56e253afa..a3cb0b1bd6 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -408,11 +408,13 @@ SCALE_FUNC %1, %2, X, X8, 7, %3
 SCALE_FUNCS  8, 15, %1
 SCALE_FUNCS  9, 15, %2
 SCALE_FUNCS 10, 15, %2
+SCALE_FUNCS 14, 15, %2
 SCALE_FUNCS 16, 15, %3
 %endif ; !sse4
 SCALE_FUNCS  8, 19, %1
 SCALE_FUNCS  9, 19, %2
 SCALE_FUNCS 10, 19, %2
+SCALE_FUNCS 14, 19, %2
 SCALE_FUNCS 16, 19, %3
 %endmacro
 
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 37d786a87d..9f7aa80e2b 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -1,20 +1,20 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -66,6 +66,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 
+
 //MMX versions
 #if HAVE_MMX
 #undef RENAME
@@ -113,9 +114,9 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
         c->greenDither= ff_dither4[dstY&1];
     c->redDither= ff_dither8[(dstY+1)&1];
     if (dstY < dstH - 2) {
-        const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-        const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+        const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+        const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+        const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
         int i;
 
         if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
@@ -199,6 +200,67 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
     }
 }
 
+#if HAVE_MMX2
+static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    if(((int)dest) & 15){
+        return yuv2yuvX_MMX2(filter, filterSize, src, dest, dstW, dither, offset);
+    }
+    if (offset) {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         "movdqa    %%xmm3, %%xmm4\n\t"
+                         "psrlq       $24, %%xmm3\n\t"
+                         "psllq       $40, %%xmm4\n\t"
+                         "por       %%xmm4, %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    } else {
+        __asm__ volatile("movq       (%0), %%xmm3\n\t"
+                         :: "r"(dither)
+                         );
+    }
+    __asm__ volatile(
+        "pxor      %%xmm0, %%xmm0\n\t"
+        "punpcklbw %%xmm0, %%xmm3\n\t"
+        "psraw        $4, %%xmm3\n\t"
+        "movdqa    %%xmm3, %%xmm4\n\t"
+        "movdqa    %%xmm3, %%xmm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movddup                  8(%%"REG_d"), %%xmm0      \n\t" /* filterCoeff */\
+        "movdqa              (%%"REG_S", %%"REG_c", 2), %%xmm2      \n\t" /* srcData */\
+        "movdqa            16(%%"REG_S", %%"REG_c", 2), %%xmm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
+        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
+        "paddw                            %%xmm2, %%xmm3      \n\t"\
+        "paddw                            %%xmm5, %%xmm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%xmm3      \n\t"\
+        "psraw                               $3, %%xmm4      \n\t"\
+        "packuswb                         %%xmm4, %%xmm3      \n\t"
+        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t"
+        "add                         $16, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movdqa    %%xmm7, %%xmm3\n\t"
+        "movdqa    %%xmm7, %%xmm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+#endif
+
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
                                                 SwsContext *c, int16_t *data, \
@@ -210,10 +272,12 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
     SCALE_FUNC(filter_n,  8, 15, opt); \
     SCALE_FUNC(filter_n,  9, 15, opt); \
     SCALE_FUNC(filter_n, 10, 15, opt); \
+    SCALE_FUNC(filter_n, 14, 15, opt); \
     SCALE_FUNC(filter_n, 16, 15, opt); \
     SCALE_FUNC(filter_n,  8, 19, opt); \
     SCALE_FUNC(filter_n,  9, 19, opt); \
     SCALE_FUNC(filter_n, 10, 19, opt); \
+    SCALE_FUNC(filter_n, 14, 19, opt); \
     SCALE_FUNC(filter_n, 16, 19, opt)
 
 #define SCALE_FUNCS_MMX(opt) \
@@ -305,6 +369,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
 #if HAVE_MMX2
     if (cpu_flags & AV_CPU_FLAG_MMX2)
         sws_init_swScale_MMX2(c);
+    if (cpu_flags & AV_CPU_FLAG_SSE3){
+        if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
+            c->yuv2planeX = yuv2yuvX_sse3;
+    }
 #endif
 
 #if HAVE_YASM
@@ -318,7 +386,10 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
     } else if (c->srcBpc == 10) { \
         hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale10to19_ ## filtersize ## _ ## opt1; \
-    } else /* c->srcBpc == 16 */ { \
+    } else if (c->srcBpc == 14 || ((c->srcFormat==PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)) { \
+        hscalefn = c->dstBpc <= 10 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
+                                     ff_hscale14to19_ ## filtersize ## _ ## opt1; \
+    } else { /* c->srcBpc == 16 */ \
         hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
                                      ff_hscale16to19_ ## filtersize ## _ ## opt1; \
     } \
@@ -334,7 +405,7 @@ switch(c->dstBpc){ \
     case 16:                          do_16_case;                          break; \
     case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
     case 9:  if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_  ## opt; break; \
-    default: if (condition_8bit)      vscalefn = ff_yuv2planeX_8_  ## opt; break; \
+    default: if (condition_8bit)    /*vscalefn = ff_yuv2planeX_8_  ## opt;*/ break; \
     }
 #define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
     switch(c->dstBpc){ \
@@ -455,7 +526,7 @@ switch(c->dstBpc){ \
             c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
 
-    if (cpu_flags & AV_CPU_FLAG_AVX) {
+    if (HAVE_AVX && cpu_flags & AV_CPU_FLAG_AVX) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
                             HAVE_ALIGNED_STACK || ARCH_X86_64);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index ad2b32f27b..cf8e802229 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1,25 +1,26 @@
 /*
- * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #undef REAL_MOVNTQ
 #undef MOVNTQ
+#undef MOVNTQ2
 #undef PREFETCH
 
 #if COMPILE_TEMPLATE_MMX2
@@ -30,11 +31,84 @@
 
 #if COMPILE_TEMPLATE_MMX2
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movntq "
 #else
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
+#define MOVNTQ2 "movq "
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
+#if !COMPILE_TEMPLATE_MMX2
+static av_always_inline void
+dither_8to16(const uint8_t *srcDither, int rot)
+{
+    if (rot) {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "psrlq       $24, %%mm3\n\t"
+                         "psllq       $40, %%mm4\n\t"
+                         "por       %%mm4, %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    } else {
+        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
+                         "movq       (%0), %%mm3\n\t"
+                         "movq      %%mm3, %%mm4\n\t"
+                         "punpcklbw %%mm0, %%mm3\n\t"
+                         "punpckhbw %%mm0, %%mm4\n\t"
+                         :: "r"(srcDither)
+                         );
+    }
+}
+#endif
+
+static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint8_t *dest, int dstW,
+                           const uint8_t *dither, int offset)
+{
+    dither_8to16(dither, offset);
+    __asm__ volatile(\
+        "psraw        $4, %%mm3\n\t"
+        "psraw        $4, %%mm4\n\t"
+        "movq    %%mm3, %%mm6\n\t"
+        "movq    %%mm4, %%mm7\n\t"
+        "movl %3, %%ecx\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        ".p2align                             4             \n\t" /* FIXME Unroll? */\
+        "1:                                                 \n\t"\
+        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
+        "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
+        "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
+        "add                                $16, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "test                         %%"REG_S", %%"REG_S"  \n\t"\
+        "pmulhw                           %%mm0, %%mm2      \n\t"\
+        "pmulhw                           %%mm0, %%mm5      \n\t"\
+        "paddw                            %%mm2, %%mm3      \n\t"\
+        "paddw                            %%mm5, %%mm4      \n\t"\
+        " jnz                                1b             \n\t"\
+        "psraw                               $3, %%mm3      \n\t"\
+        "psraw                               $3, %%mm4      \n\t"\
+        "packuswb                         %%mm4, %%mm3      \n\t"
+        MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
+        "add                          $8, %%"REG_c"         \n\t"\
+        "cmp                          %2, %%"REG_c"         \n\t"\
+        "movq    %%mm6, %%mm3\n\t"
+        "movq    %%mm7, %%mm4\n\t"
+        "mov                                 %0, %%"REG_d"  \n\t"\
+        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
+        "jb                                  1b             \n\t"\
+        :: "g" (filter),
+           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
+        : "%"REG_d, "%"REG_S, "%"REG_c
+    );
+}
+
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
@@ -260,7 +334,7 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX_ACCURATE
@@ -293,7 +367,7 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
         YSCALEYUV2PACKEDX
@@ -350,7 +424,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -374,7 +448,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -427,7 +501,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -451,7 +525,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -584,7 +658,7 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     YSCALEYUV2RGBX
@@ -608,7 +682,7 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     YSCALEYUV2RGBX
@@ -649,7 +723,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX_ACCURATE
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -670,7 +744,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    x86_reg uv_off = c->uv_off_byte;
+    x86_reg uv_off = c->uv_offx2;
 
     YSCALEYUV2PACKEDX
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -786,8 +860,8 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
             : "%r8"
         );
 #else
-        *(const uint16_t **)(&c->u_temp)=abuf0;
-        *(const uint16_t **)(&c->v_temp)=abuf1;
+        c->u_temp=(intptr_t)abuf0;
+        c->v_temp=(intptr_t)abuf1;
         __asm__ volatile(
             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
             "mov        %4, %%"REG_b"               \n\t"
@@ -1559,9 +1633,9 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum PixelFormat dstFormat = c->dstFormat;
 
-    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
-        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
-        if (!(c->flags & SWS_BITEXACT)) {
+    c->use_mmx_vfilter= 0;
+    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
+        && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
             if (c->flags & SWS_ACCURATE_RND) {
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
@@ -1574,6 +1648,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             } else {
+                c->use_mmx_vfilter= 1;
+                c->yuv2planeX = RENAME(yuv2yuvX    );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
@@ -1585,7 +1661,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             }
-        }
         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
             switch (c->dstFormat) {
             case PIX_FMT_RGB32:
diff --git a/libswscale/x86/yuv2rgb_mmx.c b/libswscale/x86/yuv2rgb_mmx.c
index 0eaea77485..fd184eb71d 100644
--- a/libswscale/x86/yuv2rgb_mmx.c
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -7,20 +7,20 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -68,10 +68,6 @@ SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (c->srcFormat != PIX_FMT_YUV420P &&
-        c->srcFormat != PIX_FMT_YUVA420P)
-        return NULL;
-
 #if HAVE_MMX2
     if (cpu_flags & AV_CPU_FLAG_MMX2) {
         switch (c->dstFormat) {
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index 5d1fa5b309..624de14252 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -4,20 +4,20 @@
  * Copyright (C) 2001-2007 Michael Niedermayer
  *           (c) 2010 Konstantin Shishkov
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -43,17 +43,14 @@
     if (h_size * depth > FFABS(dstStride[0]))                        \
         h_size -= 8;                                                 \
                                                                      \
-    if (c->srcFormat == PIX_FMT_YUV422P) {                           \
-        srcStride[1] *= 2;                                           \
-        srcStride[2] *= 2;                                           \
-    }                                                                \
+    vshift = c->srcFormat != PIX_FMT_YUV422P;                        \
                                                                      \
     __asm__ volatile ("pxor %mm4, %mm4\n\t");                        \
     for (y = 0; y < srcSliceH; y++) {                                \
         uint8_t *image    = dst[0] + (y + srcSliceY) * dstStride[0]; \
         const uint8_t *py = src[0] +               y * srcStride[0]; \
-        const uint8_t *pu = src[1] +        (y >> 1) * srcStride[1]; \
-        const uint8_t *pv = src[2] +        (y >> 1) * srcStride[2]; \
+        const uint8_t *pu = src[1] +   (y >> vshift) * srcStride[1]; \
+        const uint8_t *pv = src[2] +   (y >> vshift) * srcStride[2]; \
         x86_reg index = -h_size / 2;                                 \
 
 #define YUV2RGB_INITIAL_LOAD          \
@@ -141,6 +138,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index)                                      \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -148,6 +146,7 @@
         : "+r" (index), "+r" (image)                              \
         : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
           "r" (py - 2*index), "r" (pa - 2*index)                  \
+        : "memory"                                                \
         );                                                        \
     }                                                             \
 
@@ -188,7 +187,7 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -216,7 +215,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(2)
 
@@ -306,7 +305,7 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -324,7 +323,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(3)
 
@@ -368,7 +367,7 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -389,7 +388,7 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -411,7 +410,7 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcSliceY, int srcSliceH,
                                        uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
@@ -432,7 +431,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-    int y, h_size;
+    int y, h_size, vshift;
 
     YUV2RGB_LOOP(4)
 
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 1c44a2f544..021a38f414 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -6,27 +6,26 @@
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
-#include <assert.h>
 
 #include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
@@ -34,6 +33,7 @@
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
+#include "libavutil/pixdesc.h"
 
 extern const uint8_t dither_4x4_16[4][8];
 extern const uint8_t dither_8x8_32[8][8];
@@ -61,9 +61,9 @@ const int *sws_getCoefficients(int colorspace)
 #define LOADCHROMA(i)                               \
     U = pu[i];                                      \
     V = pv[i];                                      \
-    r = (void *)c->table_rV[V];                     \
-    g = (void *)(c->table_gU[U] + c->table_gV[V]);  \
-    b = (void *)c->table_bU[U];
+    r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM];                     \
+    g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]);  \
+    b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB(dst, src, i)                         \
     Y              = src[2 * i];                    \
@@ -505,7 +505,7 @@ CLOSEYUV2RGBFUNC(8)
 YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
     const uint8_t *d128 = dither_8x8_220[y & 7];
     char out_1 = 0, out_2 = 0;
-    g = c->table_gU[128] + c->table_gV[128];
+    g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
 
 #define PUTRGB1(out, src, i, o)                     \
     Y    = src[2 * i];                              \
@@ -547,7 +547,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 
     av_log(c, AV_LOG_WARNING,
            "No accelerated colorspace conversion found from %s to %s.\n",
-           sws_format_name(c->srcFormat), sws_format_name(c->dstFormat));
+           av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
     switch (c->dstFormat) {
     case PIX_FMT_BGR48BE:
@@ -558,14 +558,11 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
         return yuv2rgb_c_48;
     case PIX_FMT_ARGB:
     case PIX_FMT_ABGR:
-        if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P)
+        if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
             return yuva2argb_c;
     case PIX_FMT_RGBA:
     case PIX_FMT_BGRA:
-        if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P)
-            return yuva2rgba_c;
-        else
-            return yuv2rgb_c_32;
+        return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
     case PIX_FMT_RGB24:
         return yuv2rgb_c_24_rgb;
     case PIX_FMT_BGR24:
@@ -589,36 +586,32 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
         return yuv2rgb_c_4b_ordered_dither;
     case PIX_FMT_MONOBLACK:
         return yuv2rgb_c_1_ordered_dither;
-    default:
-        assert(0);
     }
     return NULL;
 }
 
-static void fill_table(uint8_t *table[256], const int elemsize,
+static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
                        const int inc, void *y_tab)
 {
     int i;
-    int64_t cb       = 0;
     uint8_t *y_table = y_tab;
 
     y_table -= elemsize * (inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = y_table + elemsize * (cb >> 16);
-        cb      += inc;
     }
 }
 
-static void fill_gv_table(int table[256], const int elemsize, const int inc)
+static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int inc)
 {
     int i;
-    int64_t cb = 0;
     int off    = -(inc >> 9);
 
-    for (i = 0; i < 256; i++) {
+    for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
+        int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
         table[i] = elemsize * (off + (cb >> 16));
-        cb      += inc;
     }
 }
 
@@ -661,7 +654,7 @@ av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
     uint8_t *y_table;
     uint16_t *y_table16;
     uint32_t *y_table32;
-    int i, base, rbase, gbase, bbase, abase, needAlpha;
+    int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
     const int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];