Diffstat (limited to 'libswscale')
 libswscale/Makefile               |   3
 libswscale/colorspace-test.c      |  79
 libswscale/ppc/yuv2rgb_altivec.h  |  19
 libswscale/rgb2rgb.h              |  30
 libswscale/swscale-test.c         | 186
 libswscale/swscale.c              |  16
 libswscale/swscale.h              |   5
 libswscale/swscale_internal.h     | 250
 libswscale/utils.c                |   8
 libswscale/x86/input.asm          | 242
 libswscale/x86/swscale_mmx.c      |  83
 libswscale/x86/swscale_template.c | 163
 12 files changed, 643 insertions(+), 441 deletions(-)
diff --git a/libswscale/Makefile b/libswscale/Makefile
index 78d0112c8e..77d896a76b 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -19,7 +19,8 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \
x86/swscale_mmx.o \
x86/yuv2rgb_mmx.o
OBJS-$(HAVE_VIS) += sparc/yuv2rgb_vis.o
-MMX-OBJS-$(HAVE_YASM) += x86/output.o \
+MMX-OBJS-$(HAVE_YASM) += x86/input.o \
+ x86/output.o \
x86/scale.o
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c
index a5709e482e..89713a8a0c 100644
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -27,19 +27,19 @@
#include "swscale.h"
#include "rgb2rgb.h"
-#define SIZE 1000
+#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
-#define FUNC(s,d,n) {s,d,#n,n}
+#define FUNC(s, d, n) { s, d, #n, n }
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
- int failedNum=0;
- int passedNum=0;
+ int failedNum = 0;
+ int passedNum = 0;
if (!srcBuffer || !dstBuffer)
return -1;
@@ -47,7 +47,7 @@ int main(int argc, char **argv)
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
sws_rgb2rgb_init();
- for(funcNum=0; ; funcNum++) {
+ for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
@@ -85,67 +85,78 @@ int main(int argc, char **argv)
FUNC(0, 0, NULL)
};
int width;
- int failed=0;
- int srcBpp=0;
- int dstBpp=0;
+ int failed = 0;
+ int srcBpp = 0;
+ int dstBpp = 0;
- if (!func_info[funcNum].func) break;
+ if (!func_info[funcNum].func)
+ break;
- av_log(NULL, AV_LOG_INFO,".");
+ av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
- for(width=63; width>0; width--) {
+ for (width = 63; width > 0; width--) {
int dstOffset;
- for(dstOffset=128; dstOffset<196; dstOffset+=4) {
+ for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
- for(srcOffset=128; srcOffset<196; srcOffset+=4) {
- uint8_t *src= srcBuffer+srcOffset;
- uint8_t *dst= dstBuffer+dstOffset;
- const char *name=NULL;
+ for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
+ uint8_t *src = srcBuffer + srcOffset;
+ uint8_t *dst = dstBuffer + dstOffset;
+ const char *name = NULL;
- if(failed) break; //don't fill the screen with shit ...
+ // don't fill the screen with shit ...
+ if (failed)
+ break;
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
- func_info[funcNum].func(src, dst, width*srcBpp);
+ func_info[funcNum].func(src, dst, width * srcBpp);
- if(!srcBpp) break;
+ if (!srcBpp)
+ break;
- for(i=0; i<SIZE; i++) {
- if(srcBuffer[i]!=srcByte) {
- av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n",
+ for (i = 0; i < SIZE; i++) {
+ if (srcBuffer[i] != srcByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
- failed=1;
+ failed = 1;
break;
}
}
- for(i=0; i<dstOffset; i++) {
- if(dstBuffer[i]!=dstByte) {
- av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
+ for (i = 0; i < dstOffset; i++) {
+ if (dstBuffer[i] != dstByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
- failed=1;
+ failed = 1;
break;
}
}
- for(i=dstOffset + width*dstBpp; i<SIZE; i++) {
- if(dstBuffer[i]!=dstByte) {
- av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
+ for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
+ if (dstBuffer[i] != dstByte) {
+ av_log(NULL, AV_LOG_INFO,
+ "dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
- failed=1;
+ failed = 1;
break;
}
}
}
}
}
- if(failed) failedNum++;
- else if(srcBpp) passedNum++;
+ if (failed)
+ failedNum++;
+ else if (srcBpp)
+ passedNum++;
}
- av_log(NULL, AV_LOG_INFO, "\n%d converters passed, %d converters randomly overwrote memory\n", passedNum, failedNum);
+ av_log(NULL, AV_LOG_INFO,
+ "\n%d converters passed, %d converters randomly overwrote memory\n",
+ passedNum, failedNum);
return failedNum;
}
diff --git a/libswscale/ppc/yuv2rgb_altivec.h b/libswscale/ppc/yuv2rgb_altivec.h
index 7c2a7e547b..8c62c322e7 100644
--- a/libswscale/ppc/yuv2rgb_altivec.h
+++ b/libswscale/ppc/yuv2rgb_altivec.h
@@ -24,13 +24,18 @@
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
-#define YUV2PACKEDX_HEADER(suffix) \
-void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
- const int16_t **lumSrc, int lumFilterSize, \
- const int16_t *chrFilter, const int16_t **chrUSrc, \
- const int16_t **chrVSrc, int chrFilterSize, \
- const int16_t **alpSrc, uint8_t *dest, \
- int dstW, int dstY);
+#define YUV2PACKEDX_HEADER(suffix) \
+ void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
+ const int16_t *lumFilter, \
+ const int16_t **lumSrc, \
+ int lumFilterSize, \
+ const int16_t *chrFilter, \
+ const int16_t **chrUSrc, \
+ const int16_t **chrVSrc, \
+ int chrFilterSize, \
+ const int16_t **alpSrc, \
+ uint8_t *dest, \
+ int dstW, int dstY);
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index e3edac88d4..a7542cb211 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -36,32 +36,33 @@ extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb32to16) (const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb32to15) (const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb15to16) (const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb15to32) (const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb16to15) (const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb16to32) (const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, int src_size);
-extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
-void rgb24to32 (const uint8_t *src, uint8_t *dst, int src_size);
-void rgb32to24 (const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
+
+void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
+void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
-void rgb16to24 (const uint8_t *src, uint8_t *dst, int src_size);
+void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
-void rgb15to24 (const uint8_t *src, uint8_t *dst, int src_size);
+void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
-void bgr8torgb8 (const uint8_t *src, uint8_t *dst, int src_size);
+void bgr8torgb8(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
@@ -138,7 +139,6 @@ extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
-
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
diff --git a/libswscale/swscale-test.c b/libswscale/swscale-test.c
index 5dd2e34870..f382ce3d6a 100644
--- a/libswscale/swscale-test.c
+++ b/libswscale/swscale-test.c
@@ -35,33 +35,32 @@
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
-#define isGray(x) ( \
- (x)==PIX_FMT_GRAY8 \
- || (x)==PIX_FMT_GRAY16BE \
- || (x)==PIX_FMT_GRAY16LE \
- )
-#define hasChroma(x) (!( \
- isGray(x) \
- || (x)==PIX_FMT_MONOBLACK \
- || (x)==PIX_FMT_MONOWHITE \
- ))
-#define isALPHA(x) ( \
- (x)==PIX_FMT_BGR32 \
- || (x)==PIX_FMT_BGR32_1 \
- || (x)==PIX_FMT_RGB32 \
- || (x)==PIX_FMT_RGB32_1 \
- || (x)==PIX_FMT_YUVA420P \
- )
-
-static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h)
+#define isGray(x) \
+ ((x) == PIX_FMT_GRAY8 || \
+ (x) == PIX_FMT_Y400A || \
+ (x) == PIX_FMT_GRAY16BE || \
+ (x) == PIX_FMT_GRAY16LE)
+#define hasChroma(x) \
+ (!(isGray(x) || \
+ (x) == PIX_FMT_MONOBLACK || \
+ (x) == PIX_FMT_MONOWHITE))
+#define isALPHA(x) \
+ ((x) == PIX_FMT_BGR32 || \
+ (x) == PIX_FMT_BGR32_1 || \
+ (x) == PIX_FMT_RGB32 || \
+ (x) == PIX_FMT_RGB32_1 || \
+ (x) == PIX_FMT_YUVA420P)
+
+static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1,
+ int stride2, int w, int h)
{
- int x,y;
- uint64_t ssd=0;
+ int x, y;
+ uint64_t ssd = 0;
- for (y=0; y<h; y++) {
- for (x=0; x<w; x++) {
- int d= src1[x + y*stride1] - src2[x + y*stride2];
- ssd+= d*d;
+ for (y = 0; y < h; y++) {
+ for (x = 0; x < w; x++) {
+ int d = src1[x + y * stride1] - src2[x + y * stride2];
+ ssd += d * d;
}
}
return ssd;
@@ -86,14 +85,14 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
static int cur_srcW, cur_srcH;
static uint8_t *src[4];
static int srcStride[4];
- uint8_t *dst[4] = {0};
- uint8_t *out[4] = {0};
+ uint8_t *dst[4] = { 0 };
+ uint8_t *out[4] = { 0 };
int dstStride[4];
int i;
- uint64_t ssdY, ssdU=0, ssdV=0, ssdA=0;
+ uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
- int res = 0;
+ int res = 0;
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
@@ -106,11 +105,10 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
- src[p] = av_mallocz(srcStride[p]*srcH+16);
+ src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
-
goto end;
}
}
@@ -121,19 +119,18 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
av_pix_fmt_descriptors[PIX_FMT_YUVA420P].name,
av_pix_fmt_descriptors[srcFormat].name);
res = -1;
-
goto end;
}
sws_scale(srcContext, ref, refStride, 0, h, src, srcStride);
sws_freeContext(srcContext);
cur_srcFormat = srcFormat;
- cur_srcW = srcW;
- cur_srcH = srcH;
+ cur_srcW = srcW;
+ cur_srcH = srcH;
}
av_image_fill_linesizes(dstStride, dstFormat, dstW);
- for (i=0; i<4; i++) {
+ for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
@@ -142,7 +139,7 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
- dst[i]= av_mallocz(dstStride[i]*dstH+16);
+ dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
@@ -151,13 +148,13 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
}
}
- dstContext= sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL, NULL);
+ dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
+ flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
av_pix_fmt_descriptors[srcFormat].name,
av_pix_fmt_descriptors[dstFormat].name);
res = -1;
-
goto end;
}
@@ -169,9 +166,9 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
- for (i = 0; i < 4 && dstStride[i]; i++) {
- crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i], dstStride[i] * dstH);
- }
+ for (i = 0; i < 4 && dstStride[i]; i++)
+ crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+ dstStride[i] * dstH);
if (r && crc == r->crc) {
ssdY = r->ssdY;
@@ -179,61 +176,60 @@ static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
- for (i=0; i<4; i++) {
+ for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
- out[i]= av_mallocz(refStride[i]*h);
+ out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
-
goto end;
}
}
- outContext= sws_getContext(dstW, dstH, dstFormat, w, h, PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+ outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
+ PIX_FMT_YUVA420P, SWS_BILINEAR,
+ NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
av_pix_fmt_descriptors[dstFormat].name,
av_pix_fmt_descriptors[PIX_FMT_YUVA420P].name);
res = -1;
-
goto end;
}
sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride);
- ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
+ ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
- ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
- ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
+ ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
+ (w + 1) >> 1, (h + 1) >> 1);
+ ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
+ (w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
- ssdA= getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
+ ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
- ssdY/= w*h;
- ssdU/= w*h/4;
- ssdV/= w*h/4;
- ssdA/= w*h;
+ ssdY /= w * h;
+ ssdU /= w * h / 4;
+ ssdV /= w * h / 4;
+ ssdA /= w * h;
sws_freeContext(outContext);
- for (i=0; i<4; i++) {
+ for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
- }
}
- printf(" CRC=%08x SSD=%5"PRId64",%5"PRId64",%5"PRId64",%5"PRId64"\n",
+ printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
end:
-
sws_freeContext(dstContext);
- for (i=0; i<4; i++) {
+ for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
- }
return res;
}
@@ -242,18 +238,18 @@ static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum PixelFormat srcFormat_in,
enum PixelFormat dstFormat_in)
{
- const int flags[] = { SWS_FAST_BILINEAR,
- SWS_BILINEAR, SWS_BICUBIC,
- SWS_X , SWS_POINT , SWS_AREA, 0 };
- const int srcW = w;
- const int srcH = h;
- const int dstW[] = { srcW - srcW/3, srcW, srcW + srcW/3, 0 };
- const int dstH[] = { srcH - srcH/3, srcH, srcH + srcH/3, 0 };
+ const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
+ SWS_X, SWS_POINT, SWS_AREA, 0 };
+ const int srcW = w;
+ const int srcH = h;
+ const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
+ const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum PixelFormat srcFormat, dstFormat;
for (srcFormat = srcFormat_in != PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < PIX_FMT_NB; srcFormat++) {
- if (!sws_isSupportedInput(srcFormat) || !sws_isSupportedOutput(srcFormat))
+ if (!sws_isSupportedInput(srcFormat) ||
+ !sws_isSupportedOutput(srcFormat))
continue;
for (dstFormat = dstFormat_in != PIX_FMT_NONE ? dstFormat_in : 0;
@@ -261,7 +257,8 @@ static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
int i, j, k;
int res = 0;
- if (!sws_isSupportedInput(dstFormat) || !sws_isSupportedOutput(dstFormat))
+ if (!sws_isSupportedInput(dstFormat) ||
+ !sws_isSupportedOutput(dstFormat))
continue;
printf("%s -> %s\n",
@@ -269,14 +266,13 @@ static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
av_pix_fmt_descriptors[dstFormat].name);
fflush(stdout);
- for (k = 0; flags[k] && !res; k++) {
+ for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
- }
if (dstFormat_in != PIX_FMT_NONE)
break;
}
@@ -302,13 +298,14 @@ static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
int flags;
int ret;
- ret = sscanf(buf, " %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
- " SSD=%"PRId64", %"PRId64", %"PRId64", %"PRId64"\n",
- srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
- &flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
+ ret = sscanf(buf,
+ " %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
+ " SSD=%"PRId64 ", %"PRId64 ", %"PRId64 ", %"PRId64 "\n",
+ srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
+ &flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
- ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
+ ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
}
srcFormat = av_get_pix_fmt(srcStr);
@@ -342,12 +339,12 @@ int main(int argc, char **argv)
{
enum PixelFormat srcFormat = PIX_FMT_NONE;
enum PixelFormat dstFormat = PIX_FMT_NONE;
- uint8_t *rgb_data = av_malloc (W*H*4);
- uint8_t *rgb_src[3]= {rgb_data, NULL, NULL};
- int rgb_stride[3]={4*W, 0, 0};
- uint8_t *data = av_malloc (4*W*H);
- uint8_t *src[4]= {data, data+W*H, data+W*H*2, data+W*H*3};
- int stride[4]={W, W, W, W};
+ uint8_t *rgb_data = av_malloc(W * H * 4);
+ uint8_t *rgb_src[3] = { rgb_data, NULL, NULL };
+ int rgb_stride[3] = { 4 * W, 0, 0 };
+ uint8_t *data = av_malloc(4 * W * H);
+ uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+ int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
@@ -357,41 +354,40 @@ int main(int argc, char **argv)
if (!rgb_data || !data)
return -1;
- sws= sws_getContext(W/12, H/12, PIX_FMT_RGB32, W, H, PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+ sws = sws_getContext(W / 12, H / 12, PIX_FMT_RGB32, W, H,
+ PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
av_lfg_init(&rand, 1);
- for (y=0; y<H; y++) {
- for (x=0; x<W*4; x++) {
- rgb_data[ x + y*4*W]= av_lfg_get(&rand);
- }
- }
+ for (y = 0; y < H; y++)
+ for (x = 0; x < W * 4; x++)
+ rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
sws_freeContext(sws);
av_free(rgb_data);
for (i = 1; i < argc; i += 2) {
- if (argv[i][0] != '-' || i+1 == argc)
+ if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
- FILE *fp = fopen(argv[i+1], "r");
+ FILE *fp = fopen(argv[i + 1], "r");
if (!fp) {
- fprintf(stderr, "could not open '%s'\n", argv[i+1]);
+ fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
goto end;
} else if (!strcmp(argv[i], "-src")) {
- srcFormat = av_get_pix_fmt(argv[i+1]);
+ srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == PIX_FMT_NONE) {
- fprintf(stderr, "invalid pixel format %s\n", argv[i+1]);
+ fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
- dstFormat = av_get_pix_fmt(argv[i+1]);
+ dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == PIX_FMT_NONE) {
- fprintf(stderr, "invalid pixel format %s\n", argv[i+1]);
+ fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
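
The comment kept in doTest() notes that image buffers passed into libswscale may be allocated however the caller prefers, as long as each plane is sufficiently aligned and freed appropriately. A minimal, self-contained C sketch of that pattern (illustrative only, not part of the patch; the pixel formats and sizes are arbitrary choices):

    #include <stdint.h>
    #include "libavutil/common.h"    /* FFALIGN() */
    #include "libavutil/imgutils.h"  /* av_image_fill_linesizes() */
    #include "libavutil/mem.h"       /* av_mallocz()/av_free() */
    #include "libswscale/swscale.h"

    static int convert_frame(const uint8_t *const src[4], const int srcStride[4],
                             int srcW, int srcH)
    {
        uint8_t *dst[4] = { 0 };
        int dstStride[4], i, ret = 0;
        struct SwsContext *ctx = sws_getContext(srcW, srcH, PIX_FMT_YUV420P,
                                                srcW / 2, srcH / 2, PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL, NULL, NULL);
        if (!ctx)
            return -1;

        av_image_fill_linesizes(dstStride, PIX_FMT_RGB24, srcW / 2);
        for (i = 0; i < 4; i++) {
            dstStride[i] = FFALIGN(dstStride[i], 16);   /* align each plane */
            if (dstStride[i] && !(dst[i] = av_mallocz(dstStride[i] * (srcH / 2))))
                ret = -1;
        }
        if (!ret)
            sws_scale(ctx, src, srcStride, 0, srcH, dst, dstStride);

        for (i = 0; i < 4; i++)
            av_free(dst[i]);        /* av_free() matches av_mallocz() */
        sws_freeContext(ctx);
        return ret;
    }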
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index dca6b073d0..3cb9bfdd27 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1637,12 +1637,16 @@ rgb16_32_wrapper(PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0
rgb16_32_wrapper(PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_BGR444LE, bgr12le, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT+4)
rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_RGB444LE, rgb12le, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT+4)
rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_BGR444BE, bgr12be, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT+4)
rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7)
+rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT+4)
static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
@@ -2887,6 +2891,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
+ case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_half_c; break;
+ case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_half_c; break;
case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half_c; break;
case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c; break;
case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_half_c; break;
@@ -2895,6 +2901,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
case PIX_FMT_GBR24P : c->chrToYV12 = gbr24pToUV_half_c; break;
+ case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_half_c; break;
+ case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_half_c; break;
}
} else {
switch(srcFormat) {
@@ -2909,6 +2917,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
+ case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_c; break;
+ case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_c; break;
case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_c; break;
case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c; break;
case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_c; break;
@@ -2916,6 +2926,8 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
+ case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_c; break;
+ case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_c; break;
}
}
@@ -2960,11 +2972,15 @@ static av_cold void sws_init_swScale_c(SwsContext *c)
case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
+ case PIX_FMT_BGR444LE : c->lumToYV12 = bgr12leToY_c; break;
+ case PIX_FMT_BGR444BE : c->lumToYV12 = bgr12beToY_c; break;
case PIX_FMT_RGB24 : c->lumToYV12 = rgb24ToY_c; break;
case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
+ case PIX_FMT_RGB444LE : c->lumToYV12 = rgb12leToY_c; break;
+ case PIX_FMT_RGB444BE : c->lumToYV12 = rgb12beToY_c; break;
case PIX_FMT_RGB8 :
case PIX_FMT_BGR8 :
case PIX_FMT_PAL8 :
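
The new rgb16_32_wrapper() lines above encode the RGB444/BGR444 bit layout through their mask arguments (0x0F00/0x00F0/0x000F for RGB444LE). A hedged sketch of decoding one such pixel; the 4-to-8-bit expansion and luma weights below are illustrative placeholders, not the wrapper's actual fixed-point tables:

    #include <stdint.h>

    /* Decode one RGB444LE pixel using the masks the patch passes to
     * rgb16_32_wrapper(); the coefficients are approximate BT.601-style
     * weights, for illustration only. */
    static inline int rgb444le_to_y(uint16_t px)
    {
        int r = (px & 0x0F00) >> 8;   /* 4-bit red   */
        int g = (px & 0x00F0) >> 4;   /* 4-bit green */
        int b = (px & 0x000F);        /* 4-bit blue  */

        r = (r << 4) | r;             /* expand 4 -> 8 bits */
        g = (g << 4) | g;
        b = (b << 4) | b;

        return (66 * r + 129 * g + 25 * b + 4224) >> 8;  /* ~BT.601 luma, 16..235 */
    }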
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index f65d0767c0..fa7100c41a 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -135,7 +135,6 @@ const char *swscale_license(void);
*/
const int *sws_getCoefficients(int colorspace);
-
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct {
@@ -235,9 +234,9 @@ struct SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat
* the destination image
* @return the height of the output slice
*/
-int sws_scale(struct SwsContext *c, const uint8_t* const srcSlice[],
+int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
- uint8_t* const dst[], const int dstStride[]);
+ uint8_t *const dst[], const int dstStride[]);
/**
* @param inv_table the yuv2rgb coefficients, normally ff_yuv2rgb_coeffs[x]
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a9830eff20..f05925f842 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -32,9 +32,9 @@
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
-#define STR(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
+#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
-#define FAST_BGR2YV12 //use 7-bit instead of 15-bit coefficients
+#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
#define MAX_FILTER_SIZE 256
@@ -47,21 +47,20 @@
#endif
#if ARCH_X86_64
-# define APCK_PTR2 8
+# define APCK_PTR2 8
# define APCK_COEF 16
# define APCK_SIZE 24
#else
-# define APCK_PTR2 4
-# define APCK_COEF 8
+# define APCK_PTR2 4
+# define APCK_COEF 8
# define APCK_SIZE 16
#endif
struct SwsContext;
-typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[],
+typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
- uint8_t* dst[], int dstStride[]);
-
+ uint8_t *dst[], int dstStride[]);
/**
* Write one line of horizontally scaled data to planar output
@@ -75,8 +74,8 @@ typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t* src[],
* @param dither ordered dither array of type int16_t and size 8
* @param offset Dither offset
*/
-typedef void (*yuv2planar1_fn) (const int16_t *src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset);
+typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled data to planar output
@@ -91,9 +90,9 @@ typedef void (*yuv2planar1_fn) (const int16_t *src, uint8_t *dest, int dstW,
* @param dstW width of destination pixels
* @param offset Dither offset
*/
-typedef void (*yuv2planarX_fn) (const int16_t *filter, int filterSize,
- const int16_t **src, uint8_t *dest, int dstW,
- const uint8_t *dither, int offset);
+typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled chroma to interleaved output
@@ -110,9 +109,12 @@ typedef void (*yuv2planarX_fn) (const int16_t *filter, int filterSize,
* output, this is in uint16_t
* @param dstW width of chroma planes
*/
-typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
- const int16_t **chrUSrc, const int16_t **chrVSrc,
- uint8_t *dest, int dstW);
+typedef void (*yuv2interleavedX_fn)(struct SwsContext *c,
+ const int16_t *chrFilter,
+ int chrFilterSize,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ uint8_t *dest, int dstW);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
@@ -143,10 +145,11 @@ typedef void (*yuv2interleavedX_fn) (struct SwsContext *c, const int16_t *chrFil
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
-typedef void (*yuv2packed1_fn) (struct SwsContext *c, const int16_t *lumSrc,
- const int16_t *chrUSrc[2], const int16_t *chrVSrc[2],
- const int16_t *alpSrc, uint8_t *dest,
- int dstW, int uvalpha, int y);
+typedef void (*yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc,
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc, uint8_t *dest,
+ int dstW, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing bilinear scaling between two input lines.
@@ -175,10 +178,12 @@ typedef void (*yuv2packed1_fn) (struct SwsContext *c, const int16_t *lumSrc,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
-typedef void (*yuv2packed2_fn) (struct SwsContext *c, const int16_t *lumSrc[2],
- const int16_t *chrUSrc[2], const int16_t *chrVSrc[2],
- const int16_t *alpSrc[2], uint8_t *dest,
- int dstW, int yalpha, int uvalpha, int y);
+typedef void (*yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2],
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc[2],
+ uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
@@ -205,12 +210,13 @@ typedef void (*yuv2packed2_fn) (struct SwsContext *c, const int16_t *lumSrc[2],
* but can be used to generate comfort noise using dithering
* or some output formats.
*/
-typedef void (*yuv2packedX_fn) (struct SwsContext *c, const int16_t *lumFilter,
- const int16_t **lumSrc, int lumFilterSize,
- const int16_t *chrFilter, const int16_t **chrUSrc,
- const int16_t **chrVSrc, int chrFilterSize,
- const int16_t **alpSrc, uint8_t *dest,
- int dstW, int y);
+typedef void (*yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y);
/* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext {
@@ -263,12 +269,12 @@ typedef struct SwsContext {
int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
- int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
- int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
- int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
- int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
- int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
- int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
+ int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
+ int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
+ int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
+ int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
+ int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
+ int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
//@}
uint8_t *formatConvBuffer;
@@ -295,10 +301,10 @@ typedef struct SwsContext {
int16_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
int16_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
int16_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
- int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
- int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
- int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
- int vChrFilterSize; ///< Vertical filter size for chroma pixels.
+ int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
+ int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
+ int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
+ int vChrFilterSize; ///< Vertical filter size for chroma pixels.
//@}
int lumMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for luma/alpha planes.
@@ -310,11 +316,11 @@ typedef struct SwsContext {
int dstY; ///< Last destination vertical line output from last slice.
int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
- void * yuvTable; // pointer to the yuv->rgb table start so it can be freed()
- uint8_t * table_rV[256];
- uint8_t * table_gU[256];
- int table_gV[256];
- uint8_t * table_bU[256];
+ void *yuvTable; // pointer to the yuv->rgb table start so it can be freed()
+ uint8_t *table_rV[256];
+ uint8_t *table_gU[256];
+ int table_gV[256];
+ uint8_t *table_bU[256];
//Colorspace stuff
int contrast, brightness, saturation; // for sws_getColorspaceDetails
@@ -366,15 +372,15 @@ typedef struct SwsContext {
DECLARE_ALIGNED(8, uint64_t, yOffset);
DECLARE_ALIGNED(8, uint64_t, uOffset);
DECLARE_ALIGNED(8, uint64_t, vOffset);
- int32_t lumMmxFilter[4*MAX_FILTER_SIZE];
- int32_t chrMmxFilter[4*MAX_FILTER_SIZE];
+ int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
+ int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
int dstW; ///< Width of destination luma/alpha planes.
DECLARE_ALIGNED(8, uint64_t, esp);
DECLARE_ALIGNED(8, uint64_t, vRounder);
DECLARE_ALIGNED(8, uint64_t, u_temp);
DECLARE_ALIGNED(8, uint64_t, v_temp);
DECLARE_ALIGNED(8, uint64_t, y_temp);
- int32_t alpMmxFilter[4*MAX_FILTER_SIZE];
+ int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
// alignment of these values is not necessary, but merely here
    // to maintain the same offset across x86-32 and x86-64. Once we
// use proper offset macros in the asm, they can be removed.
@@ -393,7 +399,7 @@ typedef struct SwsContext {
vector signed short CGV;
vector signed short OY;
vector unsigned short CSHIFT;
- vector signed short *vYCoeffsBank, *vCCoeffsBank;
+ vector signed short *vYCoeffsBank, *vCCoeffsBank;
#endif
#if ARCH_BFIN
@@ -423,21 +429,25 @@ typedef struct SwsContext {
yuv2packed2_fn yuv2packed2;
yuv2packedX_fn yuv2packedX;
+ /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
- int width, uint32_t *pal); ///< Unscaled conversion of luma plane to YV12 for horizontal scaler.
+ int width, uint32_t *pal);
+ /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
- int width, uint32_t *pal); ///< Unscaled conversion of alpha plane to YV12 for horizontal scaler.
+ int width, uint32_t *pal);
+ /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
- int width, uint32_t *pal); ///< Unscaled conversion of chroma planes to YV12 for horizontal scaler.
+ int width, uint32_t *pal);
/**
- * Functions to read planar input, such as planar RGB, and convert
- * internally to Y/UV.
- */
+ * Functions to read planar input, such as planar RGB, and convert
+ * internally to Y/UV.
+ */
/** @{ */
void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width);
- void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width);
+ void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
+ int width);
/** @} */
/**
@@ -499,19 +509,20 @@ typedef struct SwsContext {
* to simplify creating SIMD code.
*/
/** @{ */
- void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
- const int16_t *filter, const int16_t *filterPos,
- int filterSize);
- void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
- const int16_t *filter, const int16_t *filterPos,
- int filterSize);
+ void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int16_t *filterPos, int filterSize);
+ void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int16_t *filterPos, int filterSize);
/** @} */
- void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed.
- void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); ///< Color range conversion function for chroma planes if needed.
+ /// Color range conversion function for luma plane if needed.
+ void (*lumConvertRange)(int16_t *dst, int width);
+ /// Color range conversion function for chroma planes if needed.
+ void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
int needs_hcscale; ///< Set if there are chroma planes to be converted.
-
} SwsContext;
//FIXME check init (where 0)
@@ -567,53 +578,54 @@ const char *sws_format_name(enum PixelFormat format);
(!(av_pix_fmt_descriptors[x].flags & PIX_FMT_PAL) && \
av_pix_fmt_descriptors[x].nb_components <= 2)
#else
-#define isGray(x) ( \
- (x)==PIX_FMT_GRAY8 \
- || (x)==PIX_FMT_GRAY8A \
- || (x)==PIX_FMT_GRAY16BE \
- || (x)==PIX_FMT_GRAY16LE \
- )
+#define isGray(x) \
+ ((x) == PIX_FMT_GRAY8 || \
+ (x) == PIX_FMT_Y400A || \
+ (x) == PIX_FMT_GRAY16BE || \
+ (x) == PIX_FMT_GRAY16LE)
#endif
-#define isRGBinInt(x) ( \
- (x)==PIX_FMT_RGB48BE \
- || (x)==PIX_FMT_RGB48LE \
- || (x)==PIX_FMT_RGBA64BE \
- || (x)==PIX_FMT_RGBA64LE \
- || (x)==PIX_FMT_RGB32 \
- || (x)==PIX_FMT_RGB32_1 \
- || (x)==PIX_FMT_RGB24 \
- || (x)==PIX_FMT_RGB565BE \
- || (x)==PIX_FMT_RGB565LE \
- || (x)==PIX_FMT_RGB555BE \
- || (x)==PIX_FMT_RGB555LE \
- || (x)==PIX_FMT_RGB444BE \
- || (x)==PIX_FMT_RGB444LE \
- || (x)==PIX_FMT_RGB8 \
- || (x)==PIX_FMT_RGB4 \
- || (x)==PIX_FMT_RGB4_BYTE \
- || (x)==PIX_FMT_MONOBLACK \
- || (x)==PIX_FMT_MONOWHITE \
+#define isRGBinInt(x) \
+ ( \
+ (x)==PIX_FMT_RGB48BE || \
+ (x)==PIX_FMT_RGB48LE || \
+ (x)==PIX_FMT_RGBA64BE || \
+ (x)==PIX_FMT_RGBA64LE || \
+ (x)==PIX_FMT_RGB32 || \
+ (x)==PIX_FMT_RGB32_1 || \
+ (x)==PIX_FMT_RGB24 || \
+ (x)==PIX_FMT_RGB565BE || \
+ (x)==PIX_FMT_RGB565LE || \
+ (x)==PIX_FMT_RGB555BE || \
+ (x)==PIX_FMT_RGB555LE || \
+ (x)==PIX_FMT_RGB444BE || \
+ (x)==PIX_FMT_RGB444LE || \
+ (x)==PIX_FMT_RGB8 || \
+ (x)==PIX_FMT_RGB4 || \
+ (x)==PIX_FMT_RGB4_BYTE || \
+ (x)==PIX_FMT_MONOBLACK || \
+ (x)==PIX_FMT_MONOWHITE \
)
-#define isBGRinInt(x) ( \
- (x)==PIX_FMT_BGR48BE \
- || (x)==PIX_FMT_BGR48LE \
- || (x)==PIX_FMT_BGRA64BE \
- || (x)==PIX_FMT_BGRA64LE \
- || (x)==PIX_FMT_BGR32 \
- || (x)==PIX_FMT_BGR32_1 \
- || (x)==PIX_FMT_BGR24 \
- || (x)==PIX_FMT_BGR565BE \
- || (x)==PIX_FMT_BGR565LE \
- || (x)==PIX_FMT_BGR555BE \
- || (x)==PIX_FMT_BGR555LE \
- || (x)==PIX_FMT_BGR444BE \
- || (x)==PIX_FMT_BGR444LE \
- || (x)==PIX_FMT_BGR8 \
- || (x)==PIX_FMT_BGR4 \
- || (x)==PIX_FMT_BGR4_BYTE \
- || (x)==PIX_FMT_MONOBLACK \
- || (x)==PIX_FMT_MONOWHITE \
+#define isBGRinInt(x) \
+ ( \
+ (x)==PIX_FMT_BGR48BE || \
+ (x)==PIX_FMT_BGR48LE || \
+ (x)==PIX_FMT_BGRA64BE || \
+ (x)==PIX_FMT_BGRA64LE || \
+ (x)==PIX_FMT_BGR32 || \
+ (x)==PIX_FMT_BGR32_1 || \
+ (x)==PIX_FMT_BGR24 || \
+ (x)==PIX_FMT_BGR565BE || \
+ (x)==PIX_FMT_BGR565LE || \
+ (x)==PIX_FMT_BGR555BE || \
+ (x)==PIX_FMT_BGR555LE || \
+ (x)==PIX_FMT_BGR444BE || \
+ (x)==PIX_FMT_BGR444LE || \
+ (x)==PIX_FMT_BGR8 || \
+ (x)==PIX_FMT_BGR4 || \
+ (x)==PIX_FMT_BGR4_BYTE|| \
+ (x)==PIX_FMT_MONOBLACK|| \
+ (x)==PIX_FMT_MONOWHITE \
)
#define isRGBinBytes(x) ( \
@@ -635,10 +647,11 @@ const char *sws_format_name(enum PixelFormat format);
|| (x)==PIX_FMT_BGR24 \
)
-#define isAnyRGB(x) ( \
- isRGBinInt(x) \
- || isBGRinInt(x) \
- || (x)==PIX_FMT_GBR24P \
+#define isAnyRGB(x) \
+ ( \
+ isRGBinInt(x) || \
+ isBGRinInt(x) || \
+ (x)==PIX_FMT_GBR24P \
)
#define isALPHA(x) \
@@ -655,15 +668,14 @@ const char *sws_format_name(enum PixelFormat format);
|| isBGRinInt(x) \
)
#else
-#define isPacked(x) (\
- (av_pix_fmt_descriptors[x].nb_components >= 2 && \
- !(av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR)) || \
- (x) == PIX_FMT_PAL8\
- )
+#define isPacked(x) \
+ ((av_pix_fmt_descriptors[x].nb_components >= 2 && \
+ !(av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR)) || \
+ (x) == PIX_FMT_PAL8)
#endif
#define isPlanar(x) \
- (av_pix_fmt_descriptors[x].nb_components >= 2 && \
+ (av_pix_fmt_descriptors[x].nb_components >= 2 && \
(av_pix_fmt_descriptors[x].flags & PIX_FMT_PLANAR))
#define usePal(x) ((av_pix_fmt_descriptors[x].flags & PIX_FMT_PAL) || (x) == PIX_FMT_Y400A)
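
The yuv2planarX_fn documentation above describes vertically filtering several horizontally scaled lines into one planar output line with ordered dithering. A sketch of the shape such a function takes, modeled on the 8-bit C path; the 12/19-bit fixed-point shifts are shown for illustration and assume 8-bit output:

    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    static void yuv2planeX_8_sketch(const int16_t *filter, int filterSize,
                                    const int16_t **src, uint8_t *dest, int dstW,
                                    const uint8_t *dither, int offset)
    {
        int i;
        for (i = 0; i < dstW; i++) {
            int j, val = dither[(i + offset) & 7] << 12;  /* dither lands in the rounding bits */
            for (j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];             /* accumulate the vertical filter */
            dest[i] = av_clip_uint8(val >> 19);           /* back to 8 bits, clipped */
        }
    }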
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 7d87a13617..1bbe58e58b 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -129,10 +129,10 @@ const static FormatEntry format_entries[PIX_FMT_NB] = {
[PIX_FMT_YUV422P16BE] = { 1 , 1 },
[PIX_FMT_YUV444P16LE] = { 1 , 1 },
[PIX_FMT_YUV444P16BE] = { 1 , 1 },
- [PIX_FMT_RGB444LE] = { 0 , 1 },
- [PIX_FMT_RGB444BE] = { 0 , 1 },
- [PIX_FMT_BGR444LE] = { 0 , 1 },
- [PIX_FMT_BGR444BE] = { 0 , 1 },
+ [PIX_FMT_RGB444LE] = { 1 , 1 },
+ [PIX_FMT_RGB444BE] = { 1 , 1 },
+ [PIX_FMT_BGR444LE] = { 1 , 1 },
+ [PIX_FMT_BGR444BE] = { 1 , 1 },
[PIX_FMT_Y400A] = { 1 , 0 },
[PIX_FMT_BGR48BE] = { 1 , 1 },
[PIX_FMT_BGR48LE] = { 1 , 1 },
diff --git a/libswscale/x86/input.asm b/libswscale/x86/input.asm
new file mode 100644
index 0000000000..af53dab7d6
--- /dev/null
+++ b/libswscale/x86/input.asm
@@ -0,0 +1,242 @@
+;******************************************************************************
+;* x86-optimized input routines; does shuffling of packed
+;* YUV formats into individual planes, and converts RGB
+;* into YUV planes also.
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
+;
+; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
+; and
+; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
+; const uint8_t *unused, int w);
+;-----------------------------------------------------------------------------
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_Y 2
+.loop_%1:
+ mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+%ifidn %2, yuyv
+ pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
+ pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
+%else ; uyvy
+ psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
+ psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
+%endif ; yuyv/uyvy
+ packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
+ mova [dstq+wq], m0
+ add wq, mmsize
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that unaligned and aligned code in loop
+; will be the same (i.e. YUYV+AVX), and thus we don't need to
+; split the loop in an aligned and unaligned case
+%macro YUYV_TO_Y_FN 2-3
+cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
+%ifdef ARCH_X86_64
+ movsxd wq, wd
+%endif
+ add dstq, wq
+%if mmsize == 16
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*2]
+%ifidn %2, yuyv
+ pcmpeqb m2, m2 ; (byte) { 0xff } x 16
+ psrlw m2, 8 ; (word) { 0x00ff } x 8
+%endif ; yuyv
+%if mmsize == 16
+ jnz .loop_u_start
+ neg wq
+ LOOP_YUYV_TO_Y a, %2
+.loop_u_start:
+ neg wq
+ LOOP_YUYV_TO_Y u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_YUYV_TO_Y a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = yuyv or uyvy
+%macro LOOP_YUYV_TO_UV 2
+.loop_%1:
+%ifidn %2, yuyv
+ mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+ psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
+ psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
+%else ; uyvy
+%if cpuflag(avx)
+ vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
+ vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
+%else
+ mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
+ mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
+ pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
+ pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
+%endif
+%endif ; yuyv/uyvy
+ packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
+ pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
+ psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
+%if mmsize == 16
+ packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
+ movh [dstUq+wq], m1
+ movhps [dstVq+wq], m1
+%else ; mmsize == 8
+ packuswb m1, m1 ; (byte) { U0, ... U3 }
+ packuswb m0, m0 ; (byte) { V0, ... V3 }
+ movh [dstUq+wq], m1
+ movh [dstVq+wq], m0
+%endif ; mmsize == 8/16
+ add wq, mmsize / 2
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = yuyv or uyvy
+; %3 = if specified, it means that unaligned and aligned code in loop
+; will be the same (i.e. UYVY+AVX), and thus we don't need to
+; split the loop in an aligned and unaligned case
+%macro YUYV_TO_UV_FN 2-3
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%ifdef ARCH_X86_64
+ movsxd wq, r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add dstUq, wq
+ add dstVq, wq
+%if mmsize == 16 && %0 == 2
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*4]
+ pcmpeqb m2, m2 ; (byte) { 0xff } x 16
+ psrlw m2, 8 ; (word) { 0x00ff } x 8
+ ; NOTE: if uyvy+avx, u/a are identical
+%if mmsize == 16 && %0 == 2
+ jnz .loop_u_start
+ neg wq
+ LOOP_YUYV_TO_UV a, %2
+.loop_u_start:
+ neg wq
+ LOOP_YUYV_TO_UV u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_YUYV_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+; %1 = a (aligned) or u (unaligned)
+; %2 = nv12 or nv21
+%macro LOOP_NVXX_TO_UV 2
+.loop_%1:
+ mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
+ mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
+ pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
+ pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
+ psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
+ psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
+ packuswb m2, m3 ; (byte) { U0, ..., U15 }
+ packuswb m0, m1 ; (byte) { V0, ..., V15 }
+%ifidn %2, nv12
+ mova [dstUq+wq], m2
+ mova [dstVq+wq], m0
+%else ; nv21
+ mova [dstVq+wq], m2
+ mova [dstUq+wq], m0
+%endif ; nv12/21
+ add wq, mmsize
+ jl .loop_%1
+ REP_RET
+%endmacro
+
+; %1 = nr. of XMM registers
+; %2 = nv12 or nv21
+%macro NVXX_TO_UV_FN 2
+cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
+%ifdef ARCH_X86_64
+ movsxd wq, r5m
+%else ; x86-32
+ mov wq, r5m
+%endif
+ add dstUq, wq
+ add dstVq, wq
+%if mmsize == 16
+ test srcq, 15
+%endif
+ lea srcq, [srcq+wq*2]
+ pcmpeqb m5, m5 ; (byte) { 0xff } x 16
+ psrlw m5, 8 ; (word) { 0x00ff } x 8
+%if mmsize == 16
+ jnz .loop_u_start
+ neg wq
+ LOOP_NVXX_TO_UV a, %2
+.loop_u_start:
+ neg wq
+ LOOP_NVXX_TO_UV u, %2
+%else ; mmsize == 8
+ neg wq
+ LOOP_NVXX_TO_UV a, %2
+%endif ; mmsize == 8/16
+%endmacro
+
+%ifdef ARCH_X86_32
+INIT_MMX mmx
+YUYV_TO_Y_FN 0, yuyv
+YUYV_TO_Y_FN 0, uyvy
+YUYV_TO_UV_FN 0, yuyv
+YUYV_TO_UV_FN 0, uyvy
+NVXX_TO_UV_FN 0, nv12
+NVXX_TO_UV_FN 0, nv21
+%endif
+
+INIT_XMM sse2
+YUYV_TO_Y_FN 3, yuyv
+YUYV_TO_Y_FN 2, uyvy
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
+
+INIT_XMM avx
+; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
+; that's not faster in practice
+YUYV_TO_UV_FN 3, yuyv
+YUYV_TO_UV_FN 3, uyvy, 1
+NVXX_TO_UV_FN 5, nv12
+NVXX_TO_UV_FN 5, nv21
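
The header comment in input.asm describes shuffling packed YUYV/UYVY/NV12/NV21 data into separate planes. A plain-C reference of what the new routines compute (illustrative only; the asm processes 8 or 16 pixels per iteration and handles alignment):

    #include <stdint.h>

    /* YUYV memory layout: Y0 U0 Y1 V0  Y2 U1 Y3 V1 ...
     * UYVY memory layout: U0 Y0 V0 Y1  U1 Y2 V1 Y3 ... */
    static void yuyv_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++)
            dst[i] = src[2 * i];            /* keep the even (luma) bytes */
    }

    static void yuyv_to_uv_ref(uint8_t *dstU, uint8_t *dstV,
                               const uint8_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++) {           /* w = number of chroma samples */
            dstU[i] = src[4 * i + 1];
            dstV[i] = src[4 * i + 3];
        }
    }

    static void nv12_to_uv_ref(uint8_t *dstU, uint8_t *dstV,
                               const uint8_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++) {           /* swap dstU/dstV for NV21 */
            dstU[i] = src[2 * i];
            dstV[i] = src[2 * i + 1];
        }
    }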
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 7cac5d80fd..66c4f69394 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -307,6 +307,26 @@ VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
+#define INPUT_UV_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *src, const uint8_t *unused1, \
+ int w, uint32_t *unused2)
+#define INPUT_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+ int w, uint32_t *unused); \
+ INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) \
+ INPUT_FUNC(uyvy, opt); \
+ INPUT_FUNC(yuyv, opt); \
+ INPUT_UV_FUNC(nv12, opt); \
+ INPUT_UV_FUNC(nv21, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(avx);
+
void ff_sws_init_swScale_mmx(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -366,6 +386,30 @@ switch(c->dstBpc){ \
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
+
+ switch (c->srcFormat) {
+ case PIX_FMT_Y400A:
+ c->lumToYV12 = ff_yuyvToY_mmx;
+ if (c->alpPixBuf)
+ c->alpToYV12 = ff_uyvyToY_mmx;
+ break;
+ case PIX_FMT_YUYV422:
+ c->lumToYV12 = ff_yuyvToY_mmx;
+ c->chrToYV12 = ff_yuyvToUV_mmx;
+ break;
+ case PIX_FMT_UYVY422:
+ c->lumToYV12 = ff_uyvyToY_mmx;
+ c->chrToYV12 = ff_uyvyToUV_mmx;
+ break;
+ case PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_mmx;
+ break;
+ case PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_mmx;
+ break;
+ default:
+ break;
+ }
}
if (cpu_flags & AV_CPU_FLAG_MMX2) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2,);
@@ -384,6 +428,28 @@ switch(c->dstBpc){ \
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2,);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+ switch (c->srcFormat) {
+ case PIX_FMT_Y400A:
+ c->lumToYV12 = ff_yuyvToY_sse2;
+ if (c->alpPixBuf)
+ c->alpToYV12 = ff_uyvyToY_sse2;
+ break;
+ case PIX_FMT_YUYV422:
+ c->lumToYV12 = ff_yuyvToY_sse2;
+ c->chrToYV12 = ff_yuyvToUV_sse2;
+ break;
+ case PIX_FMT_UYVY422:
+ c->lumToYV12 = ff_uyvyToY_sse2;
+ c->chrToYV12 = ff_uyvyToUV_sse2;
+ break;
+ case PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_sse2;
+ break;
+ case PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_sse2;
+ break;
+ }
}
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -402,6 +468,23 @@ switch(c->dstBpc){ \
if (cpu_flags & AV_CPU_FLAG_AVX) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx,);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+ switch (c->srcFormat) {
+ case PIX_FMT_YUYV422:
+ c->chrToYV12 = ff_yuyvToUV_avx;
+ break;
+ case PIX_FMT_UYVY422:
+ c->chrToYV12 = ff_uyvyToUV_avx;
+ break;
+ case PIX_FMT_NV12:
+ c->chrToYV12 = ff_nv12ToUV_avx;
+ break;
+ case PIX_FMT_NV21:
+ c->chrToYV12 = ff_nv21ToUV_avx;
+ break;
+ default:
+ break;
+ }
}
#endif
}
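
For reference, the INPUT_FUNCS(sse2) invocation above expands to the following six prototypes, matching the entry points defined in input.asm (expansion shown for clarity, not part of the patch):

    extern void ff_uyvyToY_sse2 (uint8_t *dst, const uint8_t *src, int w, uint32_t *unused);
    extern void ff_uyvyToUV_sse2(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
                                 const uint8_t *unused1, int w, uint32_t *unused2);
    extern void ff_yuyvToY_sse2 (uint8_t *dst, const uint8_t *src, int w, uint32_t *unused);
    extern void ff_yuyvToUV_sse2(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
                                 const uint8_t *unused1, int w, uint32_t *unused2);
    extern void ff_nv12ToUV_sse2(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
                                 const uint8_t *unused1, int w, uint32_t *unused2);
    extern void ff_nv21ToUV_sse2(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
                                 const uint8_t *unused1, int w, uint32_t *unused2);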
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index bb351c2394..79c63b7d47 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1435,147 +1435,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
}
}
-#if !COMPILE_TEMPLATE_MMX2
-//FIXME yuy2* can read up to 7 samples too much
-
-static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm2 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "pand %%mm2, %%mm0 \n\t"
- "pand %%mm2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "movd %%mm1, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
- : "%"REG_a
- );
- assert(src1 == src2);
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "movd %%mm1, (%2, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
- : "%"REG_a
- );
- assert(src1 == src2);
-}
-
-static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
- const uint8_t *src, int width)
-{
- __asm__ volatile(
- "movq "MANGLE(bm01010101)", %%mm4 \n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm3 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "psrlw $8, %%mm2 \n\t"
- "psrlw $8, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "movq %%mm2, (%3, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
- : "%"REG_a
- );
-}
-
-static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- RENAME(nvXXtoUV)(dstU, dstV, src1, width);
-}
-
-static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
- const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
- int width, uint32_t *unused)
-{
- RENAME(nvXXtoUV)(dstV, dstU, src1, width);
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-
static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
int width, enum PixelFormat srcFormat)
{
@@ -1927,15 +1786,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
#endif /* COMPILE_TEMPLATE_MMX2 */
}
-#if !COMPILE_TEMPLATE_MMX2
- switch(srcFormat) {
- case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
- case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
- case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
- case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
- default: break;
- }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
if (!c->chrSrcHSubSample) {
switch(srcFormat) {
case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
@@ -1945,21 +1795,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
switch (srcFormat) {
-#if !COMPILE_TEMPLATE_MMX2
- case PIX_FMT_YUYV422 :
- case PIX_FMT_Y400A : c->lumToYV12 = RENAME(yuy2ToY); break;
- case PIX_FMT_UYVY422 : c->lumToYV12 = RENAME(uyvyToY); break;
-#endif /* !COMPILE_TEMPLATE_MMX2 */
case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
default: break;
}
-#if !COMPILE_TEMPLATE_MMX2
- if (c->alpPixBuf) {
- switch (srcFormat) {
- case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
- default: break;
- }
- }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
}