From befa8e665c460244ba523131e6c22b19ab269e77 Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla@gmail.com>
Date: Thu, 26 Mar 2009 01:30:10 +0000
Subject: Move yuv2rgb code to subdirs.

Originally committed as revision 29063 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
---
 libswscale/Makefile               |  14 +-
 libswscale/bfin/yuv2rgb_bfin.c    | 203 ++++++++
 libswscale/mlib/yuv2rgb_mlib.c    |  85 ++++
 libswscale/ppc/yuv2rgb_altivec.c  | 962 ++++++++++++++++++++++++++++++++++++++
 libswscale/sparc/yuv2rgb_vis.c    | 209 +++++++++
 libswscale/swscale_internal.h     |   4 +
 libswscale/x86/yuv2rgb_mmx.c      |  89 ++++
 libswscale/x86/yuv2rgb_template.c | 484 +++++++++++++++++++
 libswscale/yuv2rgb.c              |  53 +--
 libswscale/yuv2rgb_altivec.c      | 962 --------------------------------------
 libswscale/yuv2rgb_bfin.c         | 203 --------
 libswscale/yuv2rgb_mlib.c         |  85 ----
 libswscale/yuv2rgb_template.c     | 484 -------------------
 libswscale/yuv2rgb_vis.c          | 209 ---------
 14 files changed, 2047 insertions(+), 1999 deletions(-)
 create mode 100644 libswscale/bfin/yuv2rgb_bfin.c
 create mode 100644 libswscale/mlib/yuv2rgb_mlib.c
 create mode 100644 libswscale/ppc/yuv2rgb_altivec.c
 create mode 100644 libswscale/sparc/yuv2rgb_vis.c
 create mode 100644 libswscale/x86/yuv2rgb_mmx.c
 create mode 100644 libswscale/x86/yuv2rgb_template.c
 delete mode 100644 libswscale/yuv2rgb_altivec.c
 delete mode 100644 libswscale/yuv2rgb_bfin.c
 delete mode 100644 libswscale/yuv2rgb_mlib.c
 delete mode 100644 libswscale/yuv2rgb_template.c
 delete mode 100644 libswscale/yuv2rgb_vis.c

(limited to 'libswscale')

diff --git a/libswscale/Makefile b/libswscale/Makefile
index 4d17e5b2b7..448ad30594 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -7,14 +7,20 @@ HEADERS = swscale.h
 
 OBJS = rgb2rgb.o swscale.o swscale_avoption.o yuv2rgb.o
 
-OBJS-$(ARCH_BFIN)          +=  internal_bfin.o swscale_bfin.o yuv2rgb_bfin.o
-OBJS-$(CONFIG_MLIB)        +=  yuv2rgb_mlib.o
-OBJS-$(HAVE_ALTIVEC)       +=  yuv2rgb_altivec.o
-OBJS-$(HAVE_VIS)           +=  yuv2rgb_vis.o
+OBJS-$(ARCH_BFIN)          +=  internal_bfin.o swscale_bfin.o bfin/yuv2rgb_bfin.o
+OBJS-$(CONFIG_MLIB)        +=  mlib/yuv2rgb_mlib.o
+OBJS-$(HAVE_ALTIVEC)       +=  ppc/yuv2rgb_altivec.o
+OBJS-$(HAVE_VIS)           +=  sparc/yuv2rgb_vis.o
+
+MMX-OBJS-$(CONFIG_GPL)     +=  x86/yuv2rgb_mmx.o        \
+
+OBJS-$(HAVE_MMX)           +=  $(MMX-OBJS-yes)
 
 EXAMPLES  = swscale-example
 TESTPROGS = cs_test
 
+DIRS = bfin mlib ppc sparc x86
+
 include $(SUBDIR)../subdir.mak
 
 $(SUBDIR)cs_test: $(SUBDIR)cs_test.o $(SUBDIR)$(LIBNAME)
diff --git a/libswscale/bfin/yuv2rgb_bfin.c b/libswscale/bfin/yuv2rgb_bfin.c
new file mode 100644
index 0000000000..39e1ec87b5
--- /dev/null
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * Blackfin video color space converter operations
+ * convert I420 YV12 to RGB in various formats
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+#include "config.h"
+#include <unistd.h>
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#ifdef __FDPIC__
+#define L1CODE __attribute__ ((l1_text))
+#else
+#define L1CODE
+#endif
+
+void ff_bfin_yuv2rgb555_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                             int w, uint32_t *coeffs) L1CODE;
+
+void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                             int w, uint32_t *coeffs) L1CODE;
+
+void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                            int w, uint32_t *coeffs) L1CODE;
+
+typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                            int w, uint32_t *coeffs);
+
+
+static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
+{
+    int oy;
+    oy      = c->yOffset&0xffff;
+    oy      = oy >> 3; // keep everything U8.0 for offset calculation
+
+    c->oc   = 128*0x01010101U;
+    c->oy   =  oy*0x01010101U;
+
+    /* copy 64bit vector coeffs down to 32bit vector coeffs */
+    c->cy  = c->yCoeff;
+    c->zero = 0;
+
+    if (rgb) {
+        c->crv = c->vrCoeff;
+        c->cbu = c->ubCoeff;
+        c->cgu = c->ugCoeff;
+        c->cgv = c->vgCoeff;
+    } else {
+        c->crv = c->ubCoeff;
+        c->cbu = c->vrCoeff;
+        c->cgu = c->vgCoeff;
+        c->cgv = c->ugCoeff;
+    }
+
+
+    if (masks == 555) {
+        c->rmask = 0x001f * 0x00010001U;
+        c->gmask = 0x03e0 * 0x00010001U;
+        c->bmask = 0x7c00 * 0x00010001U;
+    } else if (masks == 565) {
+        c->rmask = 0x001f * 0x00010001U;
+        c->gmask = 0x07e0 * 0x00010001U;
+        c->bmask = 0xf800 * 0x00010001U;
+    }
+}
+
+static int core_yuv420_rgb(SwsContext *c,
+                           uint8_t **in, int *instrides,
+                           int srcSliceY, int srcSliceH,
+                           uint8_t **oplanes, int *outstrides,
+                           ltransform lcscf, int rgb, int masks)
+{
+    uint8_t *py,*pu,*pv,*op;
+    int w  = instrides[0];
+    int h2 = srcSliceH>>1;
+    int i;
+
+    bfin_prepare_coefficients(c, rgb, masks);
+
+    py = in[0];
+    pu = in[1+(1^rgb)];
+    pv = in[1+(0^rgb)];
+
+    op = oplanes[0] + srcSliceY*outstrides[0];
+
+    for (i=0;i<h2;i++) {
+
+        lcscf(py, pu, pv, op, w, &c->oy);
+
+        py += instrides[0];
+        op += outstrides[0];
+
+        lcscf(py, pu, pv, op, w, &c->oy);
+
+        py += instrides[0];
+        pu += instrides[1];
+        pv += instrides[2];
+        op += outstrides[0];
+    }
+
+    return srcSliceH;
+}
+
+
+static int bfin_yuv420_rgb555(SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
+}
+
+static int bfin_yuv420_bgr555(SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
+}
+
+static int bfin_yuv420_rgb24(SwsContext *c,
+                             uint8_t **in, int *instrides,
+                             int srcSliceY, int srcSliceH,
+                             uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
+}
+
+static int bfin_yuv420_bgr24(SwsContext *c,
+                             uint8_t **in, int *instrides,
+                             int srcSliceY, int srcSliceH,
+                             uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
+}
+
+static int bfin_yuv420_rgb565(SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
+}
+
+static int bfin_yuv420_bgr565(SwsContext *c,
+                              uint8_t **in, int *instrides,
+                              int srcSliceY, int srcSliceH,
+                              uint8_t **oplanes, int *outstrides)
+{
+    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
+                           outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
+}
+
+
+SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
+{
+    SwsFunc f;
+
+    switch(c->dstFormat) {
+    case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
+    case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
+    case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
+    case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
+    case PIX_FMT_RGB24:  f = bfin_yuv420_rgb24;  break;
+    case PIX_FMT_BGR24:  f = bfin_yuv420_bgr24;  break;
+    default:
+        return 0;
+    }
+
+    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
+           sws_format_name (c->dstFormat));
+
+    return f;
+}
diff --git a/libswscale/mlib/yuv2rgb_mlib.c b/libswscale/mlib/yuv2rgb_mlib.c
new file mode 100644
index 0000000000..a4bdd1986c
--- /dev/null
+++ b/libswscale/mlib/yuv2rgb_mlib.c
@@ -0,0 +1,85 @@
+/*
+ * software YUV to RGB converter using mediaLib
+ *
+ * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "libswscale/swscale.h"
+
+static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                               int srcSliceH, uint8_t* dst[], int dstStride[]){
+    if(c->srcFormat == PIX_FMT_YUV422P){
+        srcStride[1] *= 2;
+        srcStride[2] *= 2;
+    }
+
+    assert(srcStride[1] == srcStride[2]);
+
+    mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
+                               srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
+    return srcSliceH;
+}
+
+static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                               int srcSliceH, uint8_t* dst[], int dstStride[]){
+    if(c->srcFormat == PIX_FMT_YUV422P){
+        srcStride[1] *= 2;
+        srcStride[2] *= 2;
+    }
+
+    assert(srcStride[1] == srcStride[2]);
+
+    mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
+                               srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
+    return srcSliceH;
+}
+
+static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                              int srcSliceH, uint8_t* dst[], int dstStride[]){
+    if(c->srcFormat == PIX_FMT_YUV422P){
+        srcStride[1] *= 2;
+        srcStride[2] *= 2;
+    }
+
+    assert(srcStride[1] == srcStride[2]);
+
+    mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
+                              srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
+    return srcSliceH;
+}
+
+
+SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c)
+{
+    switch(c->dstFormat){
+    case PIX_FMT_RGB24: return mlib_YUV2RGB420_24;
+    case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32;
+    case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32;
+    default: return NULL;
+    }
+}
+
diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
new file mode 100644
index 0000000000..dc5894cda5
--- /dev/null
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -0,0 +1,962 @@
+/*
+ * AltiVec acceleration for colorspace conversion
+ *
+ * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+Convert I420 YV12 to RGB in various formats,
+  it rejects images that are not in 420 formats,
+  it rejects images that don't have widths of multiples of 16,
+  it rejects images that don't have heights of multiples of 2.
+Reject defers to C simulation code.
+
+Lots of optimizations to be done here.
+
+1. Need to fix saturation code. I just couldn't get it to fly with packs
+   and adds, so we currently use max/min to clip.
+
+2. The inefficient use of chroma loading needs a bit of brushing up.
+
+3. Analysis of pipeline stalls needs to be done. Use shark to identify
+   pipeline stalls.
+
+
+MODIFIED to calculate coeffs from currently selected color space.
+MODIFIED core to be a macro where you specify the output format.
+ADDED UYVY conversion which is never called due to some thing in swscale.
+CORRECTED algorithim selection to be strict on input formats.
+ADDED runtime detection of AltiVec.
+
+ADDED altivec_yuv2packedX vertical scl + RGB converter
+
+March 27,2004
+PERFORMANCE ANALYSIS
+
+The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
+used as test.
+The AltiVec version uses 10% of the processor or ~100Mips for D1 video
+same sequence.
+
+720 * 480 * 30  ~10MPS
+
+so we have roughly 10 clocks per pixel. This is too high, something has
+to be wrong.
+
+OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
+need for vec_min.
+
+OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
+the input video frame, it was just decompressed so it probably resides in L1
+caches. However, we are creating the output video stream. This needs to use the
+DSTST instruction to optimize for the cache. We couple this with the fact that
+we are not going to be visiting the input buffer again so we mark it Least
+Recently Used. This shaves 25% of the processor cycles off.
+
+Now memcpy is the largest mips consumer in the system, probably due
+to the inefficient X11 stuff.
+
+GL libraries seem to be very slow on this machine 1.33Ghz PB running
+Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
+a versioning issue, however I have libGL.1.2.dylib for both
+machines. (We need to figure this out now.)
+
+GL2 libraries work now with patch for RGB32.
+
+NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
+
+Integrated luma prescaling adjustment for saturation/contrast/brightness
+adjustment.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <assert.h>
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#undef PROFILE_THE_BEAST
+#undef INC_SCALING
+
+typedef unsigned char ubyte;
+typedef signed char   sbyte;
+
+
+/* RGB interleaver, 16 planar pels 8-bit samples per channel in
+   homogeneous vector registers x0,x1,x2 are interleaved with the
+   following technique:
+
+      o0 = vec_mergeh (x0,x1);
+      o1 = vec_perm (o0, x2, perm_rgb_0);
+      o2 = vec_perm (o0, x2, perm_rgb_1);
+      o3 = vec_mergel (x0,x1);
+      o4 = vec_perm (o3,o2,perm_rgb_2);
+      o5 = vec_perm (o3,o2,perm_rgb_3);
+
+  perm_rgb_0:   o0(RG).h v1(B) --> o1*
+              0   1  2   3   4
+             rgbr|gbrg|brgb|rgbr
+             0010 0100 1001 0010
+             0102 3145 2673 894A
+
+  perm_rgb_1:   o0(RG).h v1(B) --> o2
+              0   1  2   3   4
+             gbrg|brgb|bbbb|bbbb
+             0100 1001 1111 1111
+             B5CD 6EF7 89AB CDEF
+
+  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
+              0   1  2   3   4
+             gbrg|brgb|rgbr|gbrg
+             1111 1111 0010 0100
+             89AB CDEF 0182 3945
+
+  perm_rgb_2:   o3(RG).l o2(rgbB.l) ---> o5*
+              0   1  2   3   4
+             brgb|rgbr|gbrg|brgb
+             1001 0010 0100 1001
+             a67b 89cA BdCD eEFf
+
+*/
+static
+const vector unsigned char
+  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
+                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
+  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
+                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
+  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
+                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
+  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
+                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
+
+#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
+do {                                        \
+    __typeof__(x0) o0,o2,o3;                \
+        o0 = vec_mergeh (x0,x1);            \
+        y0 = vec_perm (o0, x2, perm_rgb_0); \
+        o2 = vec_perm (o0, x2, perm_rgb_1); \
+        o3 = vec_mergel (x0,x1);            \
+        y1 = vec_perm (o3,o2,perm_rgb_2);   \
+        y2 = vec_perm (o3,o2,perm_rgb_3);   \
+} while(0)
+
+#define vec_mstbgr24(x0,x1,x2,ptr)      \
+do {                                    \
+    __typeof__(x0) _0,_1,_2;            \
+    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
+    vec_st (_0, 0, ptr++);              \
+    vec_st (_1, 0, ptr++);              \
+    vec_st (_2, 0, ptr++);              \
+}  while (0);
+
+#define vec_mstrgb24(x0,x1,x2,ptr)      \
+do {                                    \
+    __typeof__(x0) _0,_1,_2;            \
+    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
+    vec_st (_0, 0, ptr++);              \
+    vec_st (_1, 0, ptr++);              \
+    vec_st (_2, 0, ptr++);              \
+}  while (0);
+
+/* pack the pixels in rgb0 format
+   msb R
+   lsb 0
+*/
+#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
+do {                                                                          \
+    T _0,_1,_2,_3;                                                            \
+    _0 = vec_mergeh (x0,x1);                                                  \
+    _1 = vec_mergeh (x2,x3);                                                  \
+    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+    vec_st (_2, 0*16, (T *)ptr);                                              \
+    vec_st (_3, 1*16, (T *)ptr);                                              \
+    _0 = vec_mergel (x0,x1);                                                  \
+    _1 = vec_mergel (x2,x3);                                                  \
+    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
+    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
+    vec_st (_2, 2*16, (T *)ptr);                                              \
+    vec_st (_3, 3*16, (T *)ptr);                                              \
+    ptr += 4;                                                                 \
+}  while (0);
+
+/*
+
+  | 1     0       1.4021   | | Y |
+  | 1    -0.3441 -0.7142   |x| Cb|
+  | 1     1.7718  0        | | Cr|
+
+
+  Y:      [-128 127]
+  Cb/Cr : [-128 127]
+
+  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
+
+*/
+
+
+
+
+#define vec_unh(x) \
+    (vector signed short) \
+        vec_perm(x,(__typeof__(x)){0}, \
+                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
+                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
+#define vec_unl(x) \
+    (vector signed short) \
+        vec_perm(x,(__typeof__(x)){0}, \
+                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
+                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
+
+#define vec_clip_s16(x) \
+    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
+                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
+
+#define vec_packclp(x,y) \
+    (vector unsigned char)vec_packs \
+        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
+         (vector unsigned short)vec_max (y,((vector signed short) {0})))
+
+//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
+
+
+static inline void cvtyuvtoRGB (SwsContext *c,
+                                vector signed short Y, vector signed short U, vector signed short V,
+                                vector signed short *R, vector signed short *G, vector signed short *B)
+{
+    vector signed   short vx,ux,uvx;
+
+    Y = vec_mradds (Y, c->CY, c->OY);
+    U  = vec_sub (U,(vector signed short)
+                    vec_splat((vector signed short){128},0));
+    V  = vec_sub (V,(vector signed short)
+                    vec_splat((vector signed short){128},0));
+
+    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
+    ux = vec_sl (U, c->CSHIFT);
+    *B = vec_mradds (ux, c->CBU, Y);
+
+    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
+    vx = vec_sl (V, c->CSHIFT);
+    *R = vec_mradds (vx, c->CRV, Y);
+
+    // uvx = ((CGU*u) + (CGV*v))>>15;
+    uvx = vec_mradds (U, c->CGU, Y);
+    *G  = vec_mradds (V, c->CGV, uvx);
+}
+
+
+/*
+  ------------------------------------------------------------------------------
+  CS converters
+  ------------------------------------------------------------------------------
+*/
+
+
+#define DEFCSP420_CVT(name,out_pixels)                                  \
+static int altivec_##name (SwsContext *c,                               \
+                           unsigned char **in, int *instrides,          \
+                           int srcSliceY,        int srcSliceH,         \
+                           unsigned char **oplanes, int *outstrides)    \
+{                                                                       \
+    int w = c->srcW;                                                    \
+    int h = srcSliceH;                                                  \
+    int i,j;                                                            \
+    int instrides_scl[3];                                               \
+    vector unsigned char y0,y1;                                         \
+                                                                        \
+    vector signed char  u,v;                                            \
+                                                                        \
+    vector signed short Y0,Y1,Y2,Y3;                                    \
+    vector signed short U,V;                                            \
+    vector signed short vx,ux,uvx;                                      \
+    vector signed short vx0,ux0,uvx0;                                   \
+    vector signed short vx1,ux1,uvx1;                                   \
+    vector signed short R0,G0,B0;                                       \
+    vector signed short R1,G1,B1;                                       \
+    vector unsigned char R,G,B;                                         \
+                                                                        \
+    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
+    vector unsigned char align_perm;                                    \
+                                                                        \
+    vector signed short                                                 \
+        lCY  = c->CY,                                                   \
+        lOY  = c->OY,                                                   \
+        lCRV = c->CRV,                                                  \
+        lCBU = c->CBU,                                                  \
+        lCGU = c->CGU,                                                  \
+        lCGV = c->CGV;                                                  \
+                                                                        \
+    vector unsigned short lCSHIFT = c->CSHIFT;                          \
+                                                                        \
+    ubyte *y1i   = in[0];                                               \
+    ubyte *y2i   = in[0]+instrides[0];                                  \
+    ubyte *ui    = in[1];                                               \
+    ubyte *vi    = in[2];                                               \
+                                                                        \
+    vector unsigned char *oute                                          \
+        = (vector unsigned char *)                                      \
+            (oplanes[0]+srcSliceY*outstrides[0]);                       \
+    vector unsigned char *outo                                          \
+        = (vector unsigned char *)                                      \
+            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
+                                                                        \
+                                                                        \
+    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
+    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
+    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
+                                                                        \
+                                                                        \
+    for (i=0;i<h/2;i++) {                                               \
+        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
+        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
+                                                                        \
+        for (j=0;j<w/16;j++) {                                          \
+                                                                        \
+            y1ivP = (vector unsigned char *)y1i;                        \
+            y2ivP = (vector unsigned char *)y2i;                        \
+            uivP  = (vector unsigned char *)ui;                         \
+            vivP  = (vector unsigned char *)vi;                         \
+                                                                        \
+            align_perm = vec_lvsl (0, y1i);                             \
+            y0 = (vector unsigned char)                                 \
+                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
+                                                                        \
+            align_perm = vec_lvsl (0, y2i);                             \
+            y1 = (vector unsigned char)                                 \
+                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
+                                                                        \
+            align_perm = vec_lvsl (0, ui);                              \
+            u = (vector signed char)                                    \
+                vec_perm (uivP[0], uivP[1], align_perm);                \
+                                                                        \
+            align_perm = vec_lvsl (0, vi);                              \
+            v = (vector signed char)                                    \
+                vec_perm (vivP[0], vivP[1], align_perm);                \
+                                                                        \
+            u  = (vector signed char)                                   \
+                 vec_sub (u,(vector signed char)                        \
+                          vec_splat((vector signed char){128},0));      \
+            v  = (vector signed char)                                   \
+                 vec_sub (v,(vector signed char)                        \
+                          vec_splat((vector signed char){128},0));      \
+                                                                        \
+            U  = vec_unpackh (u);                                       \
+            V  = vec_unpackh (v);                                       \
+                                                                        \
+                                                                        \
+            Y0 = vec_unh (y0);                                          \
+            Y1 = vec_unl (y0);                                          \
+            Y2 = vec_unh (y1);                                          \
+            Y3 = vec_unl (y1);                                          \
+                                                                        \
+            Y0 = vec_mradds (Y0, lCY, lOY);                             \
+            Y1 = vec_mradds (Y1, lCY, lOY);                             \
+            Y2 = vec_mradds (Y2, lCY, lOY);                             \
+            Y3 = vec_mradds (Y3, lCY, lOY);                             \
+                                                                        \
+            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
+            ux = vec_sl (U, lCSHIFT);                                   \
+            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
+            ux0  = vec_mergeh (ux,ux);                                  \
+            ux1  = vec_mergel (ux,ux);                                  \
+                                                                        \
+            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
+            vx = vec_sl (V, lCSHIFT);                                   \
+            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
+            vx0  = vec_mergeh (vx,vx);                                  \
+            vx1  = vec_mergel (vx,vx);                                  \
+                                                                        \
+            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
+            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
+            uvx = vec_mradds (V, lCGV, uvx);                            \
+            uvx0 = vec_mergeh (uvx,uvx);                                \
+            uvx1 = vec_mergel (uvx,uvx);                                \
+                                                                        \
+            R0 = vec_add (Y0,vx0);                                      \
+            G0 = vec_add (Y0,uvx0);                                     \
+            B0 = vec_add (Y0,ux0);                                      \
+            R1 = vec_add (Y1,vx1);                                      \
+            G1 = vec_add (Y1,uvx1);                                     \
+            B1 = vec_add (Y1,ux1);                                      \
+                                                                        \
+            R  = vec_packclp (R0,R1);                                   \
+            G  = vec_packclp (G0,G1);                                   \
+            B  = vec_packclp (B0,B1);                                   \
+                                                                        \
+            out_pixels(R,G,B,oute);                                     \
+                                                                        \
+            R0 = vec_add (Y2,vx0);                                      \
+            G0 = vec_add (Y2,uvx0);                                     \
+            B0 = vec_add (Y2,ux0);                                      \
+            R1 = vec_add (Y3,vx1);                                      \
+            G1 = vec_add (Y3,uvx1);                                     \
+            B1 = vec_add (Y3,ux1);                                      \
+            R  = vec_packclp (R0,R1);                                   \
+            G  = vec_packclp (G0,G1);                                   \
+            B  = vec_packclp (B0,B1);                                   \
+                                                                        \
+                                                                        \
+            out_pixels(R,G,B,outo);                                     \
+                                                                        \
+            y1i  += 16;                                                 \
+            y2i  += 16;                                                 \
+            ui   += 8;                                                  \
+            vi   += 8;                                                  \
+                                                                        \
+        }                                                               \
+                                                                        \
+        outo  += (outstrides[0])>>4;                                    \
+        oute  += (outstrides[0])>>4;                                    \
+                                                                        \
+        ui    += instrides_scl[1];                                      \
+        vi    += instrides_scl[2];                                      \
+        y1i   += instrides_scl[0];                                      \
+        y2i   += instrides_scl[0];                                      \
+    }                                                                   \
+    return srcSliceH;                                                   \
+}
+
+
+#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
+#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
+#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
+#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
+#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
+#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
+
+DEFCSP420_CVT (yuv2_abgr, out_abgr)
+#if 1
+DEFCSP420_CVT (yuv2_bgra, out_bgra)
+#else
+static int altivec_yuv2_bgra32 (SwsContext *c,
+                                unsigned char **in, int *instrides,
+                                int srcSliceY,        int srcSliceH,
+                                unsigned char **oplanes, int *outstrides)
+{
+    int w = c->srcW;
+    int h = srcSliceH;
+    int i,j;
+    int instrides_scl[3];
+    vector unsigned char y0,y1;
+
+    vector signed char  u,v;
+
+    vector signed short Y0,Y1,Y2,Y3;
+    vector signed short U,V;
+    vector signed short vx,ux,uvx;
+    vector signed short vx0,ux0,uvx0;
+    vector signed short vx1,ux1,uvx1;
+    vector signed short R0,G0,B0;
+    vector signed short R1,G1,B1;
+    vector unsigned char R,G,B;
+
+    vector unsigned char *uivP, *vivP;
+    vector unsigned char align_perm;
+
+    vector signed short
+        lCY  = c->CY,
+        lOY  = c->OY,
+        lCRV = c->CRV,
+        lCBU = c->CBU,
+        lCGU = c->CGU,
+        lCGV = c->CGV;
+
+    vector unsigned short lCSHIFT = c->CSHIFT;
+
+    ubyte *y1i   = in[0];
+    ubyte *y2i   = in[0]+w;
+    ubyte *ui    = in[1];
+    ubyte *vi    = in[2];
+
+    vector unsigned char *oute
+        = (vector unsigned char *)
+          (oplanes[0]+srcSliceY*outstrides[0]);
+    vector unsigned char *outo
+        = (vector unsigned char *)
+          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
+
+
+    instrides_scl[0] = instrides[0];
+    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
+    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
+
+
+    for (i=0;i<h/2;i++) {
+        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
+        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
+
+        for (j=0;j<w/16;j++) {
+
+            y0 = vec_ldl (0,y1i);
+            y1 = vec_ldl (0,y2i);
+            uivP = (vector unsigned char *)ui;
+            vivP = (vector unsigned char *)vi;
+
+            align_perm = vec_lvsl (0, ui);
+            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
+
+            align_perm = vec_lvsl (0, vi);
+            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
+            u  = (vector signed char)
+                 vec_sub (u,(vector signed char)
+                          vec_splat((vector signed char){128},0));
+
+            v  = (vector signed char)
+                 vec_sub (v, (vector signed char)
+                          vec_splat((vector signed char){128},0));
+
+            U  = vec_unpackh (u);
+            V  = vec_unpackh (v);
+
+
+            Y0 = vec_unh (y0);
+            Y1 = vec_unl (y0);
+            Y2 = vec_unh (y1);
+            Y3 = vec_unl (y1);
+
+            Y0 = vec_mradds (Y0, lCY, lOY);
+            Y1 = vec_mradds (Y1, lCY, lOY);
+            Y2 = vec_mradds (Y2, lCY, lOY);
+            Y3 = vec_mradds (Y3, lCY, lOY);
+
+            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
+            ux = vec_sl (U, lCSHIFT);
+            ux = vec_mradds (ux, lCBU, (vector signed short){0});
+            ux0  = vec_mergeh (ux,ux);
+            ux1  = vec_mergel (ux,ux);
+
+            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
+            vx = vec_sl (V, lCSHIFT);
+            vx = vec_mradds (vx, lCRV, (vector signed short){0});
+            vx0  = vec_mergeh (vx,vx);
+            vx1  = vec_mergel (vx,vx);
+            /* uvx = ((CGU*u) + (CGV*v))>>15 */
+            uvx = vec_mradds (U, lCGU, (vector signed short){0});
+            uvx = vec_mradds (V, lCGV, uvx);
+            uvx0 = vec_mergeh (uvx,uvx);
+            uvx1 = vec_mergel (uvx,uvx);
+            R0 = vec_add (Y0,vx0);
+            G0 = vec_add (Y0,uvx0);
+            B0 = vec_add (Y0,ux0);
+            R1 = vec_add (Y1,vx1);
+            G1 = vec_add (Y1,uvx1);
+            B1 = vec_add (Y1,ux1);
+            R  = vec_packclp (R0,R1);
+            G  = vec_packclp (G0,G1);
+            B  = vec_packclp (B0,B1);
+
+            out_argb(R,G,B,oute);
+            R0 = vec_add (Y2,vx0);
+            G0 = vec_add (Y2,uvx0);
+            B0 = vec_add (Y2,ux0);
+            R1 = vec_add (Y3,vx1);
+            G1 = vec_add (Y3,uvx1);
+            B1 = vec_add (Y3,ux1);
+            R  = vec_packclp (R0,R1);
+            G  = vec_packclp (G0,G1);
+            B  = vec_packclp (B0,B1);
+
+            out_argb(R,G,B,outo);
+            y1i  += 16;
+            y2i  += 16;
+            ui   += 8;
+            vi   += 8;
+
+        }
+
+        outo  += (outstrides[0])>>4;
+        oute  += (outstrides[0])>>4;
+
+        ui    += instrides_scl[1];
+        vi    += instrides_scl[2];
+        y1i   += instrides_scl[0];
+        y2i   += instrides_scl[0];
+    }
+    return srcSliceH;
+}
+
+#endif
+
+
+DEFCSP420_CVT (yuv2_rgba, out_rgba)
+DEFCSP420_CVT (yuv2_argb, out_argb)
+DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
+DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
+
+
+// uyvy|uyvy|uyvy|uyvy
+// 0123 4567 89ab cdef
+static
+const vector unsigned char
+    demux_u = {0x10,0x00,0x10,0x00,
+               0x10,0x04,0x10,0x04,
+               0x10,0x08,0x10,0x08,
+               0x10,0x0c,0x10,0x0c},
+    demux_v = {0x10,0x02,0x10,0x02,
+               0x10,0x06,0x10,0x06,
+               0x10,0x0A,0x10,0x0A,
+               0x10,0x0E,0x10,0x0E},
+    demux_y = {0x10,0x01,0x10,0x03,
+               0x10,0x05,0x10,0x07,
+               0x10,0x09,0x10,0x0B,
+               0x10,0x0D,0x10,0x0F};
+
+/*
+  this is so I can play live CCIR raw video
+*/
+static int altivec_uyvy_rgb32 (SwsContext *c,
+                               unsigned char **in, int *instrides,
+                               int srcSliceY,        int srcSliceH,
+                               unsigned char **oplanes, int *outstrides)
+{
+    int w = c->srcW;
+    int h = srcSliceH;
+    int i,j;
+    vector unsigned char uyvy;
+    vector signed   short Y,U,V;
+    vector signed   short R0,G0,B0,R1,G1,B1;
+    vector unsigned char  R,G,B;
+    vector unsigned char *out;
+    ubyte *img;
+
+    img = in[0];
+    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
+
+    for (i=0;i<h;i++) {
+        for (j=0;j<w/16;j++) {
+            uyvy = vec_ld (0, img);
+            U = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
+
+            V = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
+
+            Y = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
+
+            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
+
+            uyvy = vec_ld (16, img);
+            U = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
+
+            V = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
+
+            Y = (vector signed short)
+                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
+
+            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
+
+            R  = vec_packclp (R0,R1);
+            G  = vec_packclp (G0,G1);
+            B  = vec_packclp (B0,B1);
+
+            //      vec_mstbgr24 (R,G,B, out);
+            out_rgba (R,G,B,out);
+
+            img += 32;
+        }
+    }
+    return srcSliceH;
+}
+
+
+
+/* Ok currently the acceleration routine only supports
+   inputs of widths a multiple of 16
+   and heights a multiple 2
+
+   So we just fall back to the C codes for this.
+*/
+SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
+{
+    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
+        return NULL;
+
+    /*
+      and this seems not to matter too much I tried a bunch of
+      videos with abnormal widths and MPlayer crashes elsewhere.
+      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
+      boom with X11 bad match.
+
+    */
+    if ((c->srcW & 0xf) != 0)    return NULL;
+
+    switch (c->srcFormat) {
+    case PIX_FMT_YUV410P:
+    case PIX_FMT_YUV420P:
+    /*case IMGFMT_CLPL:        ??? */
+    case PIX_FMT_GRAY8:
+    case PIX_FMT_NV12:
+    case PIX_FMT_NV21:
+        if ((c->srcH & 0x1) != 0)
+            return NULL;
+
+        switch(c->dstFormat){
+        case PIX_FMT_RGB24:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
+            return altivec_yuv2_rgb24;
+        case PIX_FMT_BGR24:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
+            return altivec_yuv2_bgr24;
+        case PIX_FMT_ARGB:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
+            return altivec_yuv2_argb;
+        case PIX_FMT_ABGR:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
+            return altivec_yuv2_abgr;
+        case PIX_FMT_RGBA:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
+            return altivec_yuv2_rgba;
+        case PIX_FMT_BGRA:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
+            return altivec_yuv2_bgra;
+        default: return NULL;
+        }
+        break;
+
+    case PIX_FMT_UYVY422:
+        switch(c->dstFormat){
+        case PIX_FMT_BGR32:
+            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
+            return altivec_uyvy_rgb32;
+        default: return NULL;
+        }
+        break;
+
+    }
+    return NULL;
+}
+
+void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
+{
+    union {
+        signed short tmp[8] __attribute__ ((aligned(16)));
+        vector signed short vec;
+    } buf;
+
+    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
+    buf.tmp[1] =  -256*brightness;                                      //oy
+    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
+    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
+    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
+    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
+
+
+    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
+    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
+    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
+    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
+    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
+    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
+    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
+#if 0
+    {
+    int i;
+    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
+    for (i=0; i<6; i++)
+        printf("%s %d ", v[i],buf.tmp[i] );
+        printf("\n");
+    }
+#endif
+    return;
+}
+
+
+void
+ff_yuv2packedX_altivec(SwsContext *c,
+                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+                     uint8_t *dest, int dstW, int dstY)
+{
+    int i,j;
+    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
+    vector signed short R0,G0,B0,R1,G1,B1;
+
+    vector unsigned char R,G,B;
+    vector unsigned char *out,*nout;
+
+    vector signed short   RND = vec_splat_s16(1<<3);
+    vector unsigned short SCL = vec_splat_u16(4);
+    unsigned long scratch[16] __attribute__ ((aligned (16)));
+
+    vector signed short *YCoeffs, *CCoeffs;
+
+    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
+    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
+
+    out = (vector unsigned char *)dest;
+
+    for (i=0; i<dstW; i+=16){
+        Y0 = RND;
+        Y1 = RND;
+        /* extract 16 coeffs from lumSrc */
+        for (j=0; j<lumFilterSize; j++) {
+            X0 = vec_ld (0,  &lumSrc[j][i]);
+            X1 = vec_ld (16, &lumSrc[j][i]);
+            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
+            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
+        }
+
+        U = RND;
+        V = RND;
+        /* extract 8 coeffs from U,V */
+        for (j=0; j<chrFilterSize; j++) {
+            X  = vec_ld (0, &chrSrc[j][i/2]);
+            U  = vec_mradds (X, CCoeffs[j], U);
+            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
+            V  = vec_mradds (X, CCoeffs[j], V);
+        }
+
+        /* scale and clip signals */
+        Y0 = vec_sra (Y0, SCL);
+        Y1 = vec_sra (Y1, SCL);
+        U  = vec_sra (U,  SCL);
+        V  = vec_sra (V,  SCL);
+
+        Y0 = vec_clip_s16 (Y0);
+        Y1 = vec_clip_s16 (Y1);
+        U  = vec_clip_s16 (U);
+        V  = vec_clip_s16 (V);
+
+        /* now we have
+          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
+          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
+
+          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
+          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
+          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
+        */
+
+        U0 = vec_mergeh (U,U);
+        V0 = vec_mergeh (V,V);
+
+        U1 = vec_mergel (U,U);
+        V1 = vec_mergel (V,V);
+
+        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
+        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
+
+        R  = vec_packclp (R0,R1);
+        G  = vec_packclp (G0,G1);
+        B  = vec_packclp (B0,B1);
+
+        switch(c->dstFormat) {
+            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
+            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
+            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
+            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
+            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
+            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
+            default:
+            {
+                /* If this is reached, the caller should have called yuv2packedXinC
+                   instead. */
+                static int printed_error_message;
+                if (!printed_error_message) {
+                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
+                           sws_format_name(c->dstFormat));
+                    printed_error_message=1;
+                }
+                return;
+            }
+        }
+    }
+
+    if (i < dstW) {
+        i -= 16;
+
+        Y0 = RND;
+        Y1 = RND;
+        /* extract 16 coeffs from lumSrc */
+        for (j=0; j<lumFilterSize; j++) {
+            X0 = vec_ld (0,  &lumSrc[j][i]);
+            X1 = vec_ld (16, &lumSrc[j][i]);
+            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
+            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
+        }
+
+        U = RND;
+        V = RND;
+        /* extract 8 coeffs from U,V */
+        for (j=0; j<chrFilterSize; j++) {
+            X  = vec_ld (0, &chrSrc[j][i/2]);
+            U  = vec_mradds (X, CCoeffs[j], U);
+            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
+            V  = vec_mradds (X, CCoeffs[j], V);
+        }
+
+        /* scale and clip signals */
+        Y0 = vec_sra (Y0, SCL);
+        Y1 = vec_sra (Y1, SCL);
+        U  = vec_sra (U,  SCL);
+        V  = vec_sra (V,  SCL);
+
+        Y0 = vec_clip_s16 (Y0);
+        Y1 = vec_clip_s16 (Y1);
+        U  = vec_clip_s16 (U);
+        V  = vec_clip_s16 (V);
+
+        /* now we have
+           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
+           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
+
+           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
+           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
+           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
+        */
+
+        U0 = vec_mergeh (U,U);
+        V0 = vec_mergeh (V,V);
+
+        U1 = vec_mergel (U,U);
+        V1 = vec_mergel (V,V);
+
+        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
+        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
+
+        R  = vec_packclp (R0,R1);
+        G  = vec_packclp (G0,G1);
+        B  = vec_packclp (B0,B1);
+
+        nout = (vector unsigned char *)scratch;
+        switch(c->dstFormat) {
+            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
+            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
+            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
+            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
+            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
+            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
+            default:
+                /* Unreachable, I think. */
+                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
+                       sws_format_name(c->dstFormat));
+                return;
+        }
+
+        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
+    }
+
+}
diff --git a/libswscale/sparc/yuv2rgb_vis.c b/libswscale/sparc/yuv2rgb_vis.c
new file mode 100644
index 0000000000..cdbe140b20
--- /dev/null
+++ b/libswscale/sparc/yuv2rgb_vis.c
@@ -0,0 +1,209 @@
+/*
+ * VIS optimized software YUV to RGB converter
+ * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <stdlib.h>
+
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+
+#define YUV2RGB_INIT \
+    "wr %%g0, 0x10, %%gsr \n\t" \
+    "ldd [%5], %%f32      \n\t" \
+    "ldd [%5+8], %%f34    \n\t" \
+    "ldd [%5+16], %%f36   \n\t" \
+    "ldd [%5+24], %%f38   \n\t" \
+    "ldd [%5+32], %%f40   \n\t" \
+    "ldd [%5+40], %%f42   \n\t" \
+    "ldd [%5+48], %%f44   \n\t" \
+    "ldd [%5+56], %%f46   \n\t" \
+    "ldd [%5+64], %%f48   \n\t" \
+    "ldd [%5+72], %%f50   \n\t"
+
+#define YUV2RGB_KERNEL \
+    /* ^^^^ f0=Y f3=u f5=v */ \
+    "fmul8x16 %%f3, %%f48, %%f6   \n\t" \
+    "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
+    "fmul8x16 %%f5, %%f44, %%f8   \n\t" \
+    "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
+    "fmul8x16 %%f0, %%f42, %%f0   \n\t" \
+    "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
+    "fmul8x16 %%f3, %%f50, %%f2   \n\t" \
+    "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
+    "fmul8x16 %%f5, %%f46, %%f4   \n\t" \
+    "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
+    \
+    "fpsub16 %%f6, %%f34, %%f6   \n\t" /* 1 */ \
+    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
+    "fpsub16 %%f8, %%f38, %%f8   \n\t" /* 3 */ \
+    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
+    "fpsub16 %%f0, %%f32, %%f0   \n\t" /* 0 */ \
+    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
+    "fpsub16 %%f2, %%f36, %%f2   \n\t" /* 2 */ \
+    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
+    "fpsub16 %%f4, %%f40, %%f4   \n\t" /* 4 */ \
+    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
+    \
+    "fpadd16 %%f0, %%f8, %%f8    \n\t" /* Gt */ \
+    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
+    "fpadd16 %%f0, %%f4, %%f4    \n\t" /* R */ \
+    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
+    "fpadd16 %%f0, %%f6, %%f6    \n\t" /* B */ \
+    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
+    "fpadd16 %%f8, %%f2, %%f2    \n\t" /* G */ \
+    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
+    \
+    "fpack16 %%f4, %%f4    \n\t" \
+    "fpack16 %%f20, %%f20  \n\t" \
+    "fpack16 %%f6, %%f6    \n\t" \
+    "fpack16 %%f22, %%f22  \n\t" \
+    "fpack16 %%f2, %%f2    \n\t" \
+    "fpack16 %%f18, %%f18  \n\t"
+
+
+
+// FIXME: must be changed to set alpha to 255 instead of 0
+static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                           int srcSliceH, uint8_t* dst[], int dstStride[]){
+  int y, out1, out2, out3, out4, out5, out6;
+
+  for(y=0;y < srcSliceH;++y) {
+      __asm__ volatile (
+          YUV2RGB_INIT
+          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
+          "1:                          \n\t"
+          "ldda [%1] %%asi, %%f2       \n\t"
+          "ldda [%1+2] %%asi, %%f18    \n\t"
+          "ldda [%2] %%asi, %%f4       \n\t"
+          "ldda [%2+2] %%asi, %%f20    \n\t"
+          "ld [%0], %%f0               \n\t"
+          "ld [%0+4], %%f16            \n\t"
+          "fpmerge %%f3, %%f3, %%f2    \n\t"
+          "fpmerge %%f19, %%f19, %%f18 \n\t"
+          "fpmerge %%f5, %%f5, %%f4    \n\t"
+          "fpmerge %%f21, %%f21, %%f20 \n\t"
+          YUV2RGB_KERNEL
+          "fzero %%f0                  \n\t"
+          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
+          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
+          "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
+          "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
+          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
+          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
+          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
+          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
+          "std %%f4, [%3]              \n\t"
+          "std %%f20, [%3+16]          \n\t"
+          "std %%f6, [%3+8]            \n\t"
+          "std %%f22, [%3+24]          \n\t"
+
+          "add %0, 8, %0   \n\t"
+          "add %1, 4, %1   \n\t"
+          "add %2, 4, %2   \n\t"
+          "subcc %4, 8, %4 \n\t"
+          "bne 1b          \n\t"
+          "add %3, 32, %3  \n\t" //delay slot
+          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
+          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]),
+            "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
+            "4" (c->dstW),
+            "5" (c->sparc_coeffs)
+      );
+  }
+
+  return srcSliceH;
+}
+
+// FIXME: must be changed to set alpha to 255 instead of 0
+static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                           int srcSliceH, uint8_t* dst[], int dstStride[]){
+  int y, out1, out2, out3, out4, out5, out6;
+
+  for(y=0;y < srcSliceH;++y) {
+      __asm__ volatile (
+          YUV2RGB_INIT
+          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
+          "1:                          \n\t"
+          "ldda [%1] %%asi, %%f2       \n\t"
+          "ldda [%1+2] %%asi, %%f18    \n\t"
+          "ldda [%2] %%asi, %%f4       \n\t"
+          "ldda [%2+2] %%asi, %%f20    \n\t"
+          "ld [%0], %%f0               \n\t"
+          "ld [%0+4], %%f16            \n\t"
+          "fpmerge %%f3, %%f3, %%f2    \n\t"
+          "fpmerge %%f19, %%f19, %%f18 \n\t"
+          "fpmerge %%f5, %%f5, %%f4    \n\t"
+          "fpmerge %%f21, %%f21, %%f20 \n\t"
+          YUV2RGB_KERNEL
+          "fzero %%f0 \n\t"
+          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
+          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
+          "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
+          "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
+          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
+          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
+          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
+          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
+          "std %%f4, [%3]              \n\t"
+          "std %%f20, [%3+16]          \n\t"
+          "std %%f6, [%3+8]            \n\t"
+          "std %%f22, [%3+24]          \n\t"
+
+          "add %0, 8, %0   \n\t"
+          "add %1, 4, %1   \n\t"
+          "add %2, 4, %2   \n\t"
+          "subcc %4, 8, %4 \n\t"
+          "bne 1b          \n\t"
+          "add %3, 32, %3  \n\t" //delay slot
+          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
+          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]),
+            "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
+            "4" (c->dstW),
+            "5" (c->sparc_coeffs)
+      );
+  }
+
+  return srcSliceH;
+}
+
+SwsFunc ff_yuv2rgb_init_vis(SwsContext *c){
+    c->sparc_coeffs[5]=c->yCoeff;
+    c->sparc_coeffs[6]=c->vgCoeff;
+    c->sparc_coeffs[7]=c->vrCoeff;
+    c->sparc_coeffs[8]=c->ubCoeff;
+    c->sparc_coeffs[9]=c->ugCoeff;
+
+    c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
+    c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
+
+    if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
+        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
+        return vis_422P_ARGB32;
+    }
+    else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
+        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
+        return vis_420P_ARGB32;
+    }
+    return NULL;
+}
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 7e385da93f..91f05ad004 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -221,7 +221,11 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
 int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
 
 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation);
+SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_vis(SwsContext *c);
+SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
+SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
 void ff_yuv2packedX_altivec(SwsContext *c,
                           int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                           int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
diff --git a/libswscale/x86/yuv2rgb_mmx.c b/libswscale/x86/yuv2rgb_mmx.c
new file mode 100644
index 0000000000..ced537fcce
--- /dev/null
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -0,0 +1,89 @@
+/*
+ * software YUV to RGB converter
+ *
+ * Copyright (C) 2009 Konstantin Shishkov
+ *
+ * MMX/MMX2 template stuff (needed for fast movntq support),
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "config.h"
+#include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/x86_cpu.h"
+
+#define DITHER1XBPP // only for MMX
+
+/* hope these constant values are cache line aligned */
+DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
+DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
+
+//MMX versions
+#undef RENAME
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define RENAME(a) a ## _MMX
+#include "yuv2rgb_template.c"
+
+//MMX2 versions
+#undef RENAME
+#undef HAVE_MMX2
+#define HAVE_MMX2 1
+#define RENAME(a) a ## _MMX2
+#include "yuv2rgb_template.c"
+
+SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
+{
+    if (c->flags & SWS_CPU_CAPS_MMX2) {
+        switch (c->dstFormat) {
+        case PIX_FMT_RGB32:
+            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
+                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
+                break;
+            }else return yuv420_rgb32_MMX2;
+        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
+        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
+        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
+        }
+    }
+    if (c->flags & SWS_CPU_CAPS_MMX) {
+        switch (c->dstFormat) {
+        case PIX_FMT_RGB32:
+            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
+                if (HAVE_7REGS) return yuva420_rgb32_MMX;
+                break;
+            }else return yuv420_rgb32_MMX;
+        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
+        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
+        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
+        }
+    }
+
+    return NULL;
+}
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
new file mode 100644
index 0000000000..798eff0e33
--- /dev/null
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -0,0 +1,484 @@
+/*
+ * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
+ *
+ * Copyright (C) 2000, Silicon Integrated System Corp
+ *
+ * Author: Olie Lho <ollie@sis.com.tw>
+ *
+ * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
+ * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
+ * context / deglobalize stuff by Michael Niedermayer
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video decoder
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with mpeg2dec; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MOVNTQ
+#undef EMMS
+#undef SFENCE
+
+#if HAVE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
+#define EMMS     "femms"
+#else
+#define EMMS     "emms"
+#endif
+
+#if HAVE_MMX2
+#define MOVNTQ "movntq"
+#define SFENCE "sfence"
+#else
+#define MOVNTQ "movq"
+#define SFENCE "/nop"
+#endif
+
+#define YUV2RGB \
+    /* Do the multiply part of the conversion for even and odd pixels,
+       register usage:
+       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
+       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
+       mm6 -> Y even, mm7 -> Y odd */\
+    /* convert the chroma part */\
+    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
+    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
+\
+    "psllw $3, %%mm0;" /* Promote precision */ \
+    "psllw $3, %%mm1;" /* Promote precision */ \
+\
+    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
+    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
+\
+    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
+    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
+\
+    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
+    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
+\
+    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
+    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
+\
+    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
+\
+    /* convert the luma part */\
+    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
+    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
+\
+    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
+\
+    "psllw $3, %%mm6;" /* Promote precision */\
+    "psllw $3, %%mm7;" /* Promote precision */\
+\
+    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
+    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
+\
+    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
+    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
+\
+    /* Do the addition part of the conversion for even and odd pixels,
+       register usage:
+       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
+       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
+       mm6 -> Y even, mm7 -> Y odd */\
+    "movq %%mm0, %%mm3;" /* Copy Cblue */\
+    "movq %%mm1, %%mm4;" /* Copy Cred */\
+    "movq %%mm2, %%mm5;" /* Copy Cgreen */\
+\
+    "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
+    "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
+\
+    "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
+    "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
+\
+    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
+    "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
+\
+    /* Limit RGB even to 0..255 */\
+    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0  B6 B4 B2 B0 */\
+    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0  R6 R4 R2 R0 */\
+    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0  G6 G4 G2 G0 */\
+\
+    /* Limit RGB odd to 0..255 */\
+    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1  B7 B5 B3 B1 */\
+    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1  R7 R5 R3 R1 */\
+    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1  G7 G5 G3 G1 */\
+\
+    /* Interleave RGB even and odd */\
+    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
+    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
+    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
+
+
+#define YUV422_UNSHIFT                   \
+    if(c->srcFormat == PIX_FMT_YUV422P){ \
+        srcStride[1] *= 2;               \
+        srcStride[2] *= 2;               \
+    }                                    \
+
+#define YUV2RGB_LOOP(depth)                                   \
+    h_size= (c->dstW+7)&~7;                                   \
+    if(h_size*depth > FFABS(dstStride[0])) h_size-=8;         \
+\
+    __asm__ volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );    \
+    for (y= 0; y<srcSliceH; y++ ) {                           \
+        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; \
+        uint8_t *py = src[0] + y*srcStride[0];                \
+        uint8_t *pu = src[1] + (y>>1)*srcStride[1];           \
+        uint8_t *pv = src[2] + (y>>1)*srcStride[2];           \
+        x86_reg index= -h_size/2;                                \
+
+#define YUV2RGB_INIT                                                       \
+        /* This MMX assembly code deals with a SINGLE scan line at a time, \
+         * it converts 8 pixels in each iteration. */                      \
+        __asm__ volatile (                                                 \
+        /* load data for start of next scan line */                        \
+        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+        /*                                                                 \
+        ".balign 16     \n\t"                                              \
+        */                                                                 \
+        "1:             \n\t"                                              \
+        /* No speed difference on my p3@500 with prefetch,                 \
+         * if it is faster for anyone with -benchmark then tell me.        \
+        PREFETCH" 64(%0) \n\t"                                             \
+        PREFETCH" 64(%1) \n\t"                                             \
+        PREFETCH" 64(%2) \n\t"                                             \
+        */                                                                 \
+
+#define YUV2RGB_ENDLOOP(depth) \
+        "add $"AV_STRINGIFY(depth*8)", %1    \n\t" \
+        "add                       $4, %0    \n\t" \
+        " js                       1b        \n\t" \
+
+#define YUV2RGB_OPERANDS \
+        : "+r" (index), "+r" (image) \
+        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
+        ); \
+    } \
+    __asm__ volatile (EMMS); \
+    return srcSliceH; \
+
+#define YUV2RGB_OPERANDS_ALPHA \
+        : "+r" (index), "+r" (image) \
+        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index), "r" (pa - 2*index) \
+        ); \
+    } \
+    __asm__ volatile (EMMS); \
+    return srcSliceH; \
+
+static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int y, h_size;
+
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(2)
+
+        c->blueDither= ff_dither8[y&1];
+        c->greenDither= ff_dither4[y&1];
+        c->redDither= ff_dither8[(y+1)&1];
+
+        YUV2RGB_INIT
+        YUV2RGB
+
+#ifdef DITHER1XBPP
+        "paddusb "BLUE_DITHER"(%4), %%mm0;"
+        "paddusb "GREEN_DITHER"(%4), %%mm2;"
+        "paddusb "RED_DITHER"(%4), %%mm1;"
+#endif
+        /* mask unneeded bits off */
+        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
+        "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
+        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
+
+        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
+        "pxor %%mm4, %%mm4;" /* zero mm4 */
+
+        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
+        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
+
+        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
+        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
+        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
+
+        "psllw  $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
+        "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
+
+        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        MOVNTQ "      %%mm0, (%1);" /* store pixel 0-3 */
+
+        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
+        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
+        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
+
+        "psllw        $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
+        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+
+        "por       %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
+        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+
+        MOVNTQ "   %%mm5, 8 (%1);" /* store pixel 4-7 */
+
+    YUV2RGB_ENDLOOP(2)
+    YUV2RGB_OPERANDS
+}
+
+static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int y, h_size;
+
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(2)
+
+        c->blueDither= ff_dither8[y&1];
+        c->greenDither= ff_dither8[y&1];
+        c->redDither= ff_dither8[(y+1)&1];
+
+        YUV2RGB_INIT
+        YUV2RGB
+
+#ifdef DITHER1XBPP
+        "paddusb "BLUE_DITHER"(%4), %%mm0  \n\t"
+        "paddusb "GREEN_DITHER"(%4), %%mm2  \n\t"
+        "paddusb "RED_DITHER"(%4), %%mm1  \n\t"
+#endif
+
+        /* mask unneeded bits off */
+        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
+        "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
+        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
+
+        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
+        "psrlw   $1, %%mm1;" /* 0_r7r6r5  r4r3_0_0 0_r7r6r5 r4r3_0_0 */
+        "pxor %%mm4, %%mm4;" /* zero mm4 */
+
+        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
+        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
+
+        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
+        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
+        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
+
+        "psllw  $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
+        "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
+
+        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        MOVNTQ "      %%mm0, (%1);"  /* store pixel 0-3 */
+
+        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
+        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
+        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
+
+        "psllw        $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
+        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+
+        "por       %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
+        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+
+        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
+
+    YUV2RGB_ENDLOOP(2)
+    YUV2RGB_OPERANDS
+}
+
+static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int y, h_size;
+
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(3)
+
+        YUV2RGB_INIT
+        YUV2RGB
+        /* mm0=B, %%mm2=G, %%mm1=R */
+#if HAVE_MMX2
+        "movq "MANGLE(ff_M24A)", %%mm4     \n\t"
+        "movq "MANGLE(ff_M24C)", %%mm7     \n\t"
+        "pshufw $0x50, %%mm0, %%mm5     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
+        "pshufw $0x50, %%mm2, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */
+        "pshufw $0x00, %%mm1, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */
+
+        "pand   %%mm4, %%mm5            \n\t" /*    B2        B1       B0 */
+        "pand   %%mm4, %%mm3            \n\t" /*    G2        G1       G0 */
+        "pand   %%mm7, %%mm6            \n\t" /*       R1        R0       */
+
+        "psllq     $8, %%mm3            \n\t" /* G2        G1       G0    */
+        "por    %%mm5, %%mm6            \n\t"
+        "por    %%mm3, %%mm6            \n\t"
+        MOVNTQ" %%mm6, (%1)             \n\t"
+
+        "psrlq     $8, %%mm2            \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
+        "pshufw $0xA5, %%mm0, %%mm5     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
+        "pshufw $0x55, %%mm2, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */
+        "pshufw $0xA5, %%mm1, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */
+
+        "pand "MANGLE(ff_M24B)", %%mm5     \n\t" /* B5       B4        B3    */
+        "pand          %%mm7, %%mm3     \n\t" /*       G4        G3       */
+        "pand          %%mm4, %%mm6     \n\t" /*    R4        R3       R2 */
+
+        "por    %%mm5, %%mm3            \n\t" /* B5    G4 B4     G3 B3    */
+        "por    %%mm3, %%mm6            \n\t"
+        MOVNTQ" %%mm6, 8(%1)            \n\t"
+
+        "pshufw $0xFF, %%mm0, %%mm5     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */
+        "pshufw $0xFA, %%mm2, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
+        "pshufw $0xFA, %%mm1, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
+        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+
+        "pand          %%mm7, %%mm5     \n\t" /*       B7        B6       */
+        "pand          %%mm4, %%mm3     \n\t" /*    G7        G6       G5 */
+        "pand "MANGLE(ff_M24B)", %%mm6     \n\t" /* R7       R6        R5    */
+        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+\
+        "por          %%mm5, %%mm3      \n\t"
+        "por          %%mm3, %%mm6      \n\t"
+        MOVNTQ"       %%mm6, 16(%1)     \n\t"
+        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        "pxor         %%mm4, %%mm4      \n\t"
+
+#else
+
+        "pxor      %%mm4, %%mm4     \n\t"
+        "movq      %%mm0, %%mm5     \n\t" /* B */
+        "movq      %%mm1, %%mm6     \n\t" /* R */
+        "punpcklbw %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */
+        "punpcklbw %%mm4, %%mm1     \n\t" /* 0R0R0R0R 0 */
+        "punpckhbw %%mm2, %%mm5     \n\t" /* GBGBGBGB 2 */
+        "punpckhbw %%mm4, %%mm6     \n\t" /* 0R0R0R0R 2 */
+        "movq      %%mm0, %%mm7     \n\t" /* GBGBGBGB 0 */
+        "movq      %%mm5, %%mm3     \n\t" /* GBGBGBGB 2 */
+        "punpcklwd %%mm1, %%mm7     \n\t" /* 0RGB0RGB 0 */
+        "punpckhwd %%mm1, %%mm0     \n\t" /* 0RGB0RGB 1 */
+        "punpcklwd %%mm6, %%mm5     \n\t" /* 0RGB0RGB 2 */
+        "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */
+
+        "movq      %%mm7, %%mm2     \n\t" /* 0RGB0RGB 0 */
+        "movq      %%mm0, %%mm6     \n\t" /* 0RGB0RGB 1 */
+        "movq      %%mm5, %%mm1     \n\t" /* 0RGB0RGB 2 */
+        "movq      %%mm3, %%mm4     \n\t" /* 0RGB0RGB 3 */
+
+        "psllq       $40, %%mm7     \n\t" /* RGB00000 0 */
+        "psllq       $40, %%mm0     \n\t" /* RGB00000 1 */
+        "psllq       $40, %%mm5     \n\t" /* RGB00000 2 */
+        "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */
+
+        "punpckhdq %%mm2, %%mm7     \n\t" /* 0RGBRGB0 0 */
+        "punpckhdq %%mm6, %%mm0     \n\t" /* 0RGBRGB0 1 */
+        "punpckhdq %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */
+        "punpckhdq %%mm4, %%mm3     \n\t" /* 0RGBRGB0 3 */
+
+        "psrlq        $8, %%mm7     \n\t" /* 00RGBRGB 0 */
+        "movq      %%mm0, %%mm6     \n\t" /* 0RGBRGB0 1 */
+        "psllq       $40, %%mm0     \n\t" /* GB000000 1 */
+        "por       %%mm0, %%mm7     \n\t" /* GBRGBRGB 0 */
+        MOVNTQ"    %%mm7, (%1)      \n\t"
+
+        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+
+        "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */
+        "movq      %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */
+        "psllq       $24, %%mm5     \n\t" /* BRGB0000 2 */
+        "por       %%mm5, %%mm6     \n\t" /* BRGBRGBR 1 */
+        MOVNTQ"    %%mm6, 8(%1)     \n\t"
+
+        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+
+        "psrlq       $40, %%mm1     \n\t" /* 000000RG 2 */
+        "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */
+        "por       %%mm3, %%mm1     \n\t" /* RGBRGBRG 2 */
+        MOVNTQ"    %%mm1, 16(%1)    \n\t"
+
+        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+        "pxor      %%mm4, %%mm4     \n\t"
+#endif
+
+    YUV2RGB_ENDLOOP(3)
+    YUV2RGB_OPERANDS
+}
+
+#define RGB_PLANAR2PACKED32                                             \
+    /* convert RGB plane to RGB packed format,                          \
+       mm0 ->  B, mm1 -> R, mm2 -> G, mm3 -> A,                         \
+       mm4 -> GB, mm5 -> AR pixel 4-7,                                  \
+       mm6 -> GB, mm7 -> AR pixel 0-3 */                                \
+    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "movq      %%mm1, %%mm7;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
+\
+    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "movq      %%mm1, %%mm5;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
+\
+    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
+    "punpcklbw %%mm3, %%mm7;"   /* A3 R3 A2 R2 A1 R1 A0 R0 */           \
+\
+    "punpcklwd %%mm7, %%mm6;"   /* A1 R1 B1 G1 A0 R0 B0 G0 */           \
+    MOVNTQ "   %%mm6, (%1);"    /* Store ARGB1 ARGB0 */                 \
+\
+    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
+\
+    "punpckhwd %%mm7, %%mm6;"   /* A3 R3 G3 B3 A2 R2 B3 G2 */           \
+    MOVNTQ "   %%mm6, 8 (%1);"  /* Store ARGB3 ARGB2 */                 \
+\
+    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
+    "punpckhbw %%mm3, %%mm5;"   /* A7 R7 A6 R6 A5 R5 A4 R4 */           \
+\
+    "punpcklwd %%mm5, %%mm4;"   /* A5 R5 B5 G5 A4 R4 B4 G4 */           \
+    MOVNTQ "   %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */                 \
+\
+    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
+\
+    "punpckhwd %%mm5, %%mm4;"   /* A7 R7 G7 B7 A6 R6 B6 G6 */           \
+    MOVNTQ "   %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */                 \
+\
+    "movd 4 (%2, %0), %%mm0;"   /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+    "movd 4 (%3, %0), %%mm1;"   /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+\
+    "pxor         %%mm4, %%mm4;" /* zero mm4 */                         \
+    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+
+static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int y, h_size;
+
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(4)
+
+        YUV2RGB_INIT
+        YUV2RGB
+        "pcmpeqd   %%mm3, %%mm3;"   /* fill mm3 */
+        RGB_PLANAR2PACKED32
+
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS
+}
+
+static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
+#if HAVE_7REGS
+    int y, h_size;
+
+    YUV2RGB_LOOP(4)
+
+        uint8_t *pa = src[3] + y*srcStride[3];
+        YUV2RGB_INIT
+        YUV2RGB
+        "movq     (%6, %0, 2), %%mm3;"            /* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
+        RGB_PLANAR2PACKED32
+
+    YUV2RGB_ENDLOOP(4)
+    YUV2RGB_OPERANDS_ALPHA
+#endif
+}
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 112d4a375c..d8438c0c66 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -3,7 +3,6 @@
  *
  * Copyright (C) 2009 Konstantin Shishkov
  *
- * MMX/MMX2 template stuff (needed for fast movntq support),
  * 1,4,8bpp support and context / deglobalize stuff
  * by Michael Niedermayer (michaelni@gmx.at)
  *
@@ -35,37 +34,10 @@
 #include "swscale_internal.h"
 #include "libavutil/x86_cpu.h"
 
-#define DITHER1XBPP // only for MMX
-
 extern const uint8_t dither_8x8_32[8][8];
 extern const uint8_t dither_8x8_73[8][8];
 extern const uint8_t dither_8x8_220[8][8];
 
-#if HAVE_MMX && CONFIG_GPL
-
-/* hope these constant values are cache line aligned */
-DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
-DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
-
-//MMX versions
-#undef RENAME
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
-#define RENAME(a) a ## _MMX
-#include "yuv2rgb_template.c"
-
-//MMX2 versions
-#undef RENAME
-#undef HAVE_MMX2
-#define HAVE_MMX2 1
-#define RENAME(a) a ## _MMX2
-#include "yuv2rgb_template.c"
-
-#endif /* HAVE_MMX && CONFIG_GPL */
-
 const int32_t ff_yuv2rgb_coeffs[8][4] = {
     {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
     {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
@@ -504,30 +476,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 {
     SwsFunc t = NULL;
 #if (HAVE_MMX2 || HAVE_MMX) && CONFIG_GPL
-    if (c->flags & SWS_CPU_CAPS_MMX2) {
-        switch (c->dstFormat) {
-        case PIX_FMT_RGB32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
-                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
-                break;
-            }else return yuv420_rgb32_MMX2;
-        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
-        }
-    }
-    if (c->flags & SWS_CPU_CAPS_MMX) {
-        switch (c->dstFormat) {
-        case PIX_FMT_RGB32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P){
-                if (HAVE_7REGS) return yuva420_rgb32_MMX;
-                break;
-            }else return yuv420_rgb32_MMX;
-        case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
-        }
-    }
+     t = ff_yuv2rgb_init_mmx(c);
 #endif
 #if HAVE_VIS
     t = ff_yuv2rgb_init_vis(c);
diff --git a/libswscale/yuv2rgb_altivec.c b/libswscale/yuv2rgb_altivec.c
deleted file mode 100644
index e029805b3c..0000000000
--- a/libswscale/yuv2rgb_altivec.c
+++ /dev/null
@@ -1,962 +0,0 @@
-/*
- * AltiVec acceleration for colorspace conversion
- *
- * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/*
-Convert I420 YV12 to RGB in various formats,
-  it rejects images that are not in 420 formats,
-  it rejects images that don't have widths of multiples of 16,
-  it rejects images that don't have heights of multiples of 2.
-Reject defers to C simulation code.
-
-Lots of optimizations to be done here.
-
-1. Need to fix saturation code. I just couldn't get it to fly with packs
-   and adds, so we currently use max/min to clip.
-
-2. The inefficient use of chroma loading needs a bit of brushing up.
-
-3. Analysis of pipeline stalls needs to be done. Use shark to identify
-   pipeline stalls.
-
-
-MODIFIED to calculate coeffs from currently selected color space.
-MODIFIED core to be a macro where you specify the output format.
-ADDED UYVY conversion which is never called due to some thing in swscale.
-CORRECTED algorithim selection to be strict on input formats.
-ADDED runtime detection of AltiVec.
-
-ADDED altivec_yuv2packedX vertical scl + RGB converter
-
-March 27,2004
-PERFORMANCE ANALYSIS
-
-The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
-used as test.
-The AltiVec version uses 10% of the processor or ~100Mips for D1 video
-same sequence.
-
-720 * 480 * 30  ~10MPS
-
-so we have roughly 10 clocks per pixel. This is too high, something has
-to be wrong.
-
-OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
-need for vec_min.
-
-OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
-the input video frame, it was just decompressed so it probably resides in L1
-caches. However, we are creating the output video stream. This needs to use the
-DSTST instruction to optimize for the cache. We couple this with the fact that
-we are not going to be visiting the input buffer again so we mark it Least
-Recently Used. This shaves 25% of the processor cycles off.
-
-Now memcpy is the largest mips consumer in the system, probably due
-to the inefficient X11 stuff.
-
-GL libraries seem to be very slow on this machine 1.33Ghz PB running
-Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
-a versioning issue, however I have libGL.1.2.dylib for both
-machines. (We need to figure this out now.)
-
-GL2 libraries work now with patch for RGB32.
-
-NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
-
-Integrated luma prescaling adjustment for saturation/contrast/brightness
-adjustment.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <assert.h>
-#include "config.h"
-#include "rgb2rgb.h"
-#include "swscale.h"
-#include "swscale_internal.h"
-
-#undef PROFILE_THE_BEAST
-#undef INC_SCALING
-
-typedef unsigned char ubyte;
-typedef signed char   sbyte;
-
-
-/* RGB interleaver, 16 planar pels 8-bit samples per channel in
-   homogeneous vector registers x0,x1,x2 are interleaved with the
-   following technique:
-
-      o0 = vec_mergeh (x0,x1);
-      o1 = vec_perm (o0, x2, perm_rgb_0);
-      o2 = vec_perm (o0, x2, perm_rgb_1);
-      o3 = vec_mergel (x0,x1);
-      o4 = vec_perm (o3,o2,perm_rgb_2);
-      o5 = vec_perm (o3,o2,perm_rgb_3);
-
-  perm_rgb_0:   o0(RG).h v1(B) --> o1*
-              0   1  2   3   4
-             rgbr|gbrg|brgb|rgbr
-             0010 0100 1001 0010
-             0102 3145 2673 894A
-
-  perm_rgb_1:   o0(RG).h v1(B) --> o2
-              0   1  2   3   4
-             gbrg|brgb|bbbb|bbbb
-             0100 1001 1111 1111
-             B5CD 6EF7 89AB CDEF
-
-  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
-              0   1  2   3   4
-             gbrg|brgb|rgbr|gbrg
-             1111 1111 0010 0100
-             89AB CDEF 0182 3945
-
-  perm_rgb_2:   o3(RG).l o2(rgbB.l) ---> o5*
-              0   1  2   3   4
-             brgb|rgbr|gbrg|brgb
-             1001 0010 0100 1001
-             a67b 89cA BdCD eEFf
-
-*/
-static
-const vector unsigned char
-  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
-                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
-  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
-                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
-  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
-                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
-  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
-                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
-
-#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
-do {                                        \
-    __typeof__(x0) o0,o2,o3;                \
-        o0 = vec_mergeh (x0,x1);            \
-        y0 = vec_perm (o0, x2, perm_rgb_0); \
-        o2 = vec_perm (o0, x2, perm_rgb_1); \
-        o3 = vec_mergel (x0,x1);            \
-        y1 = vec_perm (o3,o2,perm_rgb_2);   \
-        y2 = vec_perm (o3,o2,perm_rgb_3);   \
-} while(0)
-
-#define vec_mstbgr24(x0,x1,x2,ptr)      \
-do {                                    \
-    __typeof__(x0) _0,_1,_2;            \
-    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
-    vec_st (_0, 0, ptr++);              \
-    vec_st (_1, 0, ptr++);              \
-    vec_st (_2, 0, ptr++);              \
-}  while (0);
-
-#define vec_mstrgb24(x0,x1,x2,ptr)      \
-do {                                    \
-    __typeof__(x0) _0,_1,_2;            \
-    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
-    vec_st (_0, 0, ptr++);              \
-    vec_st (_1, 0, ptr++);              \
-    vec_st (_2, 0, ptr++);              \
-}  while (0);
-
-/* pack the pixels in rgb0 format
-   msb R
-   lsb 0
-*/
-#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
-do {                                                                          \
-    T _0,_1,_2,_3;                                                            \
-    _0 = vec_mergeh (x0,x1);                                                  \
-    _1 = vec_mergeh (x2,x3);                                                  \
-    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
-    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
-    vec_st (_2, 0*16, (T *)ptr);                                              \
-    vec_st (_3, 1*16, (T *)ptr);                                              \
-    _0 = vec_mergel (x0,x1);                                                  \
-    _1 = vec_mergel (x2,x3);                                                  \
-    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
-    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
-    vec_st (_2, 2*16, (T *)ptr);                                              \
-    vec_st (_3, 3*16, (T *)ptr);                                              \
-    ptr += 4;                                                                 \
-}  while (0);
-
-/*
-
-  | 1     0       1.4021   | | Y |
-  | 1    -0.3441 -0.7142   |x| Cb|
-  | 1     1.7718  0        | | Cr|
-
-
-  Y:      [-128 127]
-  Cb/Cr : [-128 127]
-
-  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
-
-*/
-
-
-
-
-#define vec_unh(x) \
-    (vector signed short) \
-        vec_perm(x,(__typeof__(x)){0}, \
-                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
-                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
-#define vec_unl(x) \
-    (vector signed short) \
-        vec_perm(x,(__typeof__(x)){0}, \
-                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
-                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
-
-#define vec_clip_s16(x) \
-    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
-                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
-
-#define vec_packclp(x,y) \
-    (vector unsigned char)vec_packs \
-        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
-         (vector unsigned short)vec_max (y,((vector signed short) {0})))
-
-//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
-
-
-static inline void cvtyuvtoRGB (SwsContext *c,
-                                vector signed short Y, vector signed short U, vector signed short V,
-                                vector signed short *R, vector signed short *G, vector signed short *B)
-{
-    vector signed   short vx,ux,uvx;
-
-    Y = vec_mradds (Y, c->CY, c->OY);
-    U  = vec_sub (U,(vector signed short)
-                    vec_splat((vector signed short){128},0));
-    V  = vec_sub (V,(vector signed short)
-                    vec_splat((vector signed short){128},0));
-
-    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
-    ux = vec_sl (U, c->CSHIFT);
-    *B = vec_mradds (ux, c->CBU, Y);
-
-    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
-    vx = vec_sl (V, c->CSHIFT);
-    *R = vec_mradds (vx, c->CRV, Y);
-
-    // uvx = ((CGU*u) + (CGV*v))>>15;
-    uvx = vec_mradds (U, c->CGU, Y);
-    *G  = vec_mradds (V, c->CGV, uvx);
-}
-
-
-/*
-  ------------------------------------------------------------------------------
-  CS converters
-  ------------------------------------------------------------------------------
-*/
-
-
-#define DEFCSP420_CVT(name,out_pixels)                                  \
-static int altivec_##name (SwsContext *c,                               \
-                           unsigned char **in, int *instrides,          \
-                           int srcSliceY,        int srcSliceH,         \
-                           unsigned char **oplanes, int *outstrides)    \
-{                                                                       \
-    int w = c->srcW;                                                    \
-    int h = srcSliceH;                                                  \
-    int i,j;                                                            \
-    int instrides_scl[3];                                               \
-    vector unsigned char y0,y1;                                         \
-                                                                        \
-    vector signed char  u,v;                                            \
-                                                                        \
-    vector signed short Y0,Y1,Y2,Y3;                                    \
-    vector signed short U,V;                                            \
-    vector signed short vx,ux,uvx;                                      \
-    vector signed short vx0,ux0,uvx0;                                   \
-    vector signed short vx1,ux1,uvx1;                                   \
-    vector signed short R0,G0,B0;                                       \
-    vector signed short R1,G1,B1;                                       \
-    vector unsigned char R,G,B;                                         \
-                                                                        \
-    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
-    vector unsigned char align_perm;                                    \
-                                                                        \
-    vector signed short                                                 \
-        lCY  = c->CY,                                                   \
-        lOY  = c->OY,                                                   \
-        lCRV = c->CRV,                                                  \
-        lCBU = c->CBU,                                                  \
-        lCGU = c->CGU,                                                  \
-        lCGV = c->CGV;                                                  \
-                                                                        \
-    vector unsigned short lCSHIFT = c->CSHIFT;                          \
-                                                                        \
-    ubyte *y1i   = in[0];                                               \
-    ubyte *y2i   = in[0]+instrides[0];                                  \
-    ubyte *ui    = in[1];                                               \
-    ubyte *vi    = in[2];                                               \
-                                                                        \
-    vector unsigned char *oute                                          \
-        = (vector unsigned char *)                                      \
-            (oplanes[0]+srcSliceY*outstrides[0]);                       \
-    vector unsigned char *outo                                          \
-        = (vector unsigned char *)                                      \
-            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
-                                                                        \
-                                                                        \
-    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
-    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
-    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
-                                                                        \
-                                                                        \
-    for (i=0;i<h/2;i++) {                                               \
-        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
-        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
-                                                                        \
-        for (j=0;j<w/16;j++) {                                          \
-                                                                        \
-            y1ivP = (vector unsigned char *)y1i;                        \
-            y2ivP = (vector unsigned char *)y2i;                        \
-            uivP  = (vector unsigned char *)ui;                         \
-            vivP  = (vector unsigned char *)vi;                         \
-                                                                        \
-            align_perm = vec_lvsl (0, y1i);                             \
-            y0 = (vector unsigned char)                                 \
-                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
-                                                                        \
-            align_perm = vec_lvsl (0, y2i);                             \
-            y1 = (vector unsigned char)                                 \
-                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
-                                                                        \
-            align_perm = vec_lvsl (0, ui);                              \
-            u = (vector signed char)                                    \
-                vec_perm (uivP[0], uivP[1], align_perm);                \
-                                                                        \
-            align_perm = vec_lvsl (0, vi);                              \
-            v = (vector signed char)                                    \
-                vec_perm (vivP[0], vivP[1], align_perm);                \
-                                                                        \
-            u  = (vector signed char)                                   \
-                 vec_sub (u,(vector signed char)                        \
-                          vec_splat((vector signed char){128},0));      \
-            v  = (vector signed char)                                   \
-                 vec_sub (v,(vector signed char)                        \
-                          vec_splat((vector signed char){128},0));      \
-                                                                        \
-            U  = vec_unpackh (u);                                       \
-            V  = vec_unpackh (v);                                       \
-                                                                        \
-                                                                        \
-            Y0 = vec_unh (y0);                                          \
-            Y1 = vec_unl (y0);                                          \
-            Y2 = vec_unh (y1);                                          \
-            Y3 = vec_unl (y1);                                          \
-                                                                        \
-            Y0 = vec_mradds (Y0, lCY, lOY);                             \
-            Y1 = vec_mradds (Y1, lCY, lOY);                             \
-            Y2 = vec_mradds (Y2, lCY, lOY);                             \
-            Y3 = vec_mradds (Y3, lCY, lOY);                             \
-                                                                        \
-            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
-            ux = vec_sl (U, lCSHIFT);                                   \
-            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
-            ux0  = vec_mergeh (ux,ux);                                  \
-            ux1  = vec_mergel (ux,ux);                                  \
-                                                                        \
-            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
-            vx = vec_sl (V, lCSHIFT);                                   \
-            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
-            vx0  = vec_mergeh (vx,vx);                                  \
-            vx1  = vec_mergel (vx,vx);                                  \
-                                                                        \
-            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
-            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
-            uvx = vec_mradds (V, lCGV, uvx);                            \
-            uvx0 = vec_mergeh (uvx,uvx);                                \
-            uvx1 = vec_mergel (uvx,uvx);                                \
-                                                                        \
-            R0 = vec_add (Y0,vx0);                                      \
-            G0 = vec_add (Y0,uvx0);                                     \
-            B0 = vec_add (Y0,ux0);                                      \
-            R1 = vec_add (Y1,vx1);                                      \
-            G1 = vec_add (Y1,uvx1);                                     \
-            B1 = vec_add (Y1,ux1);                                      \
-                                                                        \
-            R  = vec_packclp (R0,R1);                                   \
-            G  = vec_packclp (G0,G1);                                   \
-            B  = vec_packclp (B0,B1);                                   \
-                                                                        \
-            out_pixels(R,G,B,oute);                                     \
-                                                                        \
-            R0 = vec_add (Y2,vx0);                                      \
-            G0 = vec_add (Y2,uvx0);                                     \
-            B0 = vec_add (Y2,ux0);                                      \
-            R1 = vec_add (Y3,vx1);                                      \
-            G1 = vec_add (Y3,uvx1);                                     \
-            B1 = vec_add (Y3,ux1);                                      \
-            R  = vec_packclp (R0,R1);                                   \
-            G  = vec_packclp (G0,G1);                                   \
-            B  = vec_packclp (B0,B1);                                   \
-                                                                        \
-                                                                        \
-            out_pixels(R,G,B,outo);                                     \
-                                                                        \
-            y1i  += 16;                                                 \
-            y2i  += 16;                                                 \
-            ui   += 8;                                                  \
-            vi   += 8;                                                  \
-                                                                        \
-        }                                                               \
-                                                                        \
-        outo  += (outstrides[0])>>4;                                    \
-        oute  += (outstrides[0])>>4;                                    \
-                                                                        \
-        ui    += instrides_scl[1];                                      \
-        vi    += instrides_scl[2];                                      \
-        y1i   += instrides_scl[0];                                      \
-        y2i   += instrides_scl[0];                                      \
-    }                                                                   \
-    return srcSliceH;                                                   \
-}
-
-
-#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
-#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
-#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
-#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
-#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
-#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
-
-DEFCSP420_CVT (yuv2_abgr, out_abgr)
-#if 1
-DEFCSP420_CVT (yuv2_bgra, out_bgra)
-#else
-static int altivec_yuv2_bgra32 (SwsContext *c,
-                                unsigned char **in, int *instrides,
-                                int srcSliceY,        int srcSliceH,
-                                unsigned char **oplanes, int *outstrides)
-{
-    int w = c->srcW;
-    int h = srcSliceH;
-    int i,j;
-    int instrides_scl[3];
-    vector unsigned char y0,y1;
-
-    vector signed char  u,v;
-
-    vector signed short Y0,Y1,Y2,Y3;
-    vector signed short U,V;
-    vector signed short vx,ux,uvx;
-    vector signed short vx0,ux0,uvx0;
-    vector signed short vx1,ux1,uvx1;
-    vector signed short R0,G0,B0;
-    vector signed short R1,G1,B1;
-    vector unsigned char R,G,B;
-
-    vector unsigned char *uivP, *vivP;
-    vector unsigned char align_perm;
-
-    vector signed short
-        lCY  = c->CY,
-        lOY  = c->OY,
-        lCRV = c->CRV,
-        lCBU = c->CBU,
-        lCGU = c->CGU,
-        lCGV = c->CGV;
-
-    vector unsigned short lCSHIFT = c->CSHIFT;
-
-    ubyte *y1i   = in[0];
-    ubyte *y2i   = in[0]+w;
-    ubyte *ui    = in[1];
-    ubyte *vi    = in[2];
-
-    vector unsigned char *oute
-        = (vector unsigned char *)
-          (oplanes[0]+srcSliceY*outstrides[0]);
-    vector unsigned char *outo
-        = (vector unsigned char *)
-          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
-
-
-    instrides_scl[0] = instrides[0];
-    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
-    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
-
-
-    for (i=0;i<h/2;i++) {
-        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
-        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
-
-        for (j=0;j<w/16;j++) {
-
-            y0 = vec_ldl (0,y1i);
-            y1 = vec_ldl (0,y2i);
-            uivP = (vector unsigned char *)ui;
-            vivP = (vector unsigned char *)vi;
-
-            align_perm = vec_lvsl (0, ui);
-            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
-
-            align_perm = vec_lvsl (0, vi);
-            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
-            u  = (vector signed char)
-                 vec_sub (u,(vector signed char)
-                          vec_splat((vector signed char){128},0));
-
-            v  = (vector signed char)
-                 vec_sub (v, (vector signed char)
-                          vec_splat((vector signed char){128},0));
-
-            U  = vec_unpackh (u);
-            V  = vec_unpackh (v);
-
-
-            Y0 = vec_unh (y0);
-            Y1 = vec_unl (y0);
-            Y2 = vec_unh (y1);
-            Y3 = vec_unl (y1);
-
-            Y0 = vec_mradds (Y0, lCY, lOY);
-            Y1 = vec_mradds (Y1, lCY, lOY);
-            Y2 = vec_mradds (Y2, lCY, lOY);
-            Y3 = vec_mradds (Y3, lCY, lOY);
-
-            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
-            ux = vec_sl (U, lCSHIFT);
-            ux = vec_mradds (ux, lCBU, (vector signed short){0});
-            ux0  = vec_mergeh (ux,ux);
-            ux1  = vec_mergel (ux,ux);
-
-            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
-            vx = vec_sl (V, lCSHIFT);
-            vx = vec_mradds (vx, lCRV, (vector signed short){0});
-            vx0  = vec_mergeh (vx,vx);
-            vx1  = vec_mergel (vx,vx);
-            /* uvx = ((CGU*u) + (CGV*v))>>15 */
-            uvx = vec_mradds (U, lCGU, (vector signed short){0});
-            uvx = vec_mradds (V, lCGV, uvx);
-            uvx0 = vec_mergeh (uvx,uvx);
-            uvx1 = vec_mergel (uvx,uvx);
-            R0 = vec_add (Y0,vx0);
-            G0 = vec_add (Y0,uvx0);
-            B0 = vec_add (Y0,ux0);
-            R1 = vec_add (Y1,vx1);
-            G1 = vec_add (Y1,uvx1);
-            B1 = vec_add (Y1,ux1);
-            R  = vec_packclp (R0,R1);
-            G  = vec_packclp (G0,G1);
-            B  = vec_packclp (B0,B1);
-
-            out_argb(R,G,B,oute);
-            R0 = vec_add (Y2,vx0);
-            G0 = vec_add (Y2,uvx0);
-            B0 = vec_add (Y2,ux0);
-            R1 = vec_add (Y3,vx1);
-            G1 = vec_add (Y3,uvx1);
-            B1 = vec_add (Y3,ux1);
-            R  = vec_packclp (R0,R1);
-            G  = vec_packclp (G0,G1);
-            B  = vec_packclp (B0,B1);
-
-            out_argb(R,G,B,outo);
-            y1i  += 16;
-            y2i  += 16;
-            ui   += 8;
-            vi   += 8;
-
-        }
-
-        outo  += (outstrides[0])>>4;
-        oute  += (outstrides[0])>>4;
-
-        ui    += instrides_scl[1];
-        vi    += instrides_scl[2];
-        y1i   += instrides_scl[0];
-        y2i   += instrides_scl[0];
-    }
-    return srcSliceH;
-}
-
-#endif
-
-
-DEFCSP420_CVT (yuv2_rgba, out_rgba)
-DEFCSP420_CVT (yuv2_argb, out_argb)
-DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
-DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
-
-
-// uyvy|uyvy|uyvy|uyvy
-// 0123 4567 89ab cdef
-static
-const vector unsigned char
-    demux_u = {0x10,0x00,0x10,0x00,
-               0x10,0x04,0x10,0x04,
-               0x10,0x08,0x10,0x08,
-               0x10,0x0c,0x10,0x0c},
-    demux_v = {0x10,0x02,0x10,0x02,
-               0x10,0x06,0x10,0x06,
-               0x10,0x0A,0x10,0x0A,
-               0x10,0x0E,0x10,0x0E},
-    demux_y = {0x10,0x01,0x10,0x03,
-               0x10,0x05,0x10,0x07,
-               0x10,0x09,0x10,0x0B,
-               0x10,0x0D,0x10,0x0F};
-
-/*
-  this is so I can play live CCIR raw video
-*/
-static int altivec_uyvy_rgb32 (SwsContext *c,
-                               unsigned char **in, int *instrides,
-                               int srcSliceY,        int srcSliceH,
-                               unsigned char **oplanes, int *outstrides)
-{
-    int w = c->srcW;
-    int h = srcSliceH;
-    int i,j;
-    vector unsigned char uyvy;
-    vector signed   short Y,U,V;
-    vector signed   short R0,G0,B0,R1,G1,B1;
-    vector unsigned char  R,G,B;
-    vector unsigned char *out;
-    ubyte *img;
-
-    img = in[0];
-    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
-
-    for (i=0;i<h;i++) {
-        for (j=0;j<w/16;j++) {
-            uyvy = vec_ld (0, img);
-            U = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
-
-            V = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
-
-            Y = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
-
-            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
-
-            uyvy = vec_ld (16, img);
-            U = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
-
-            V = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
-
-            Y = (vector signed short)
-                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
-
-            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
-
-            R  = vec_packclp (R0,R1);
-            G  = vec_packclp (G0,G1);
-            B  = vec_packclp (B0,B1);
-
-            //      vec_mstbgr24 (R,G,B, out);
-            out_rgba (R,G,B,out);
-
-            img += 32;
-        }
-    }
-    return srcSliceH;
-}
-
-
-
-/* Ok currently the acceleration routine only supports
-   inputs of widths a multiple of 16
-   and heights a multiple 2
-
-   So we just fall back to the C codes for this.
-*/
-SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
-{
-    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
-        return NULL;
-
-    /*
-      and this seems not to matter too much I tried a bunch of
-      videos with abnormal widths and MPlayer crashes elsewhere.
-      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
-      boom with X11 bad match.
-
-    */
-    if ((c->srcW & 0xf) != 0)    return NULL;
-
-    switch (c->srcFormat) {
-    case PIX_FMT_YUV410P:
-    case PIX_FMT_YUV420P:
-    /*case IMGFMT_CLPL:        ??? */
-    case PIX_FMT_GRAY8:
-    case PIX_FMT_NV12:
-    case PIX_FMT_NV21:
-        if ((c->srcH & 0x1) != 0)
-            return NULL;
-
-        switch(c->dstFormat){
-        case PIX_FMT_RGB24:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
-            return altivec_yuv2_rgb24;
-        case PIX_FMT_BGR24:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
-            return altivec_yuv2_bgr24;
-        case PIX_FMT_ARGB:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
-            return altivec_yuv2_argb;
-        case PIX_FMT_ABGR:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
-            return altivec_yuv2_abgr;
-        case PIX_FMT_RGBA:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
-            return altivec_yuv2_rgba;
-        case PIX_FMT_BGRA:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
-            return altivec_yuv2_bgra;
-        default: return NULL;
-        }
-        break;
-
-    case PIX_FMT_UYVY422:
-        switch(c->dstFormat){
-        case PIX_FMT_BGR32:
-            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
-            return altivec_uyvy_rgb32;
-        default: return NULL;
-        }
-        break;
-
-    }
-    return NULL;
-}
-
-void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
-{
-    union {
-        signed short tmp[8] __attribute__ ((aligned(16)));
-        vector signed short vec;
-    } buf;
-
-    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
-    buf.tmp[1] =  -256*brightness;                                      //oy
-    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
-    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
-    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
-    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
-
-
-    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
-    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
-    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
-    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
-    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
-    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
-    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
-#if 0
-    {
-    int i;
-    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
-    for (i=0; i<6; i++)
-        printf("%s %d ", v[i],buf.tmp[i] );
-        printf("\n");
-    }
-#endif
-    return;
-}
-
-
-void
-ff_yuv2packedX_altivec(SwsContext *c,
-                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
-                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
-                     uint8_t *dest, int dstW, int dstY)
-{
-    int i,j;
-    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
-    vector signed short R0,G0,B0,R1,G1,B1;
-
-    vector unsigned char R,G,B;
-    vector unsigned char *out,*nout;
-
-    vector signed short   RND = vec_splat_s16(1<<3);
-    vector unsigned short SCL = vec_splat_u16(4);
-    unsigned long scratch[16] __attribute__ ((aligned (16)));
-
-    vector signed short *YCoeffs, *CCoeffs;
-
-    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
-    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
-
-    out = (vector unsigned char *)dest;
-
-    for (i=0; i<dstW; i+=16){
-        Y0 = RND;
-        Y1 = RND;
-        /* extract 16 coeffs from lumSrc */
-        for (j=0; j<lumFilterSize; j++) {
-            X0 = vec_ld (0,  &lumSrc[j][i]);
-            X1 = vec_ld (16, &lumSrc[j][i]);
-            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-        }
-
-        U = RND;
-        V = RND;
-        /* extract 8 coeffs from U,V */
-        for (j=0; j<chrFilterSize; j++) {
-            X  = vec_ld (0, &chrSrc[j][i/2]);
-            U  = vec_mradds (X, CCoeffs[j], U);
-            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-            V  = vec_mradds (X, CCoeffs[j], V);
-        }
-
-        /* scale and clip signals */
-        Y0 = vec_sra (Y0, SCL);
-        Y1 = vec_sra (Y1, SCL);
-        U  = vec_sra (U,  SCL);
-        V  = vec_sra (V,  SCL);
-
-        Y0 = vec_clip_s16 (Y0);
-        Y1 = vec_clip_s16 (Y1);
-        U  = vec_clip_s16 (U);
-        V  = vec_clip_s16 (V);
-
-        /* now we have
-          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
-
-          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-        */
-
-        U0 = vec_mergeh (U,U);
-        V0 = vec_mergeh (V,V);
-
-        U1 = vec_mergel (U,U);
-        V1 = vec_mergel (V,V);
-
-        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-        R  = vec_packclp (R0,R1);
-        G  = vec_packclp (G0,G1);
-        B  = vec_packclp (B0,B1);
-
-        switch(c->dstFormat) {
-            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
-            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
-            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
-            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
-            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
-            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
-            default:
-            {
-                /* If this is reached, the caller should have called yuv2packedXinC
-                   instead. */
-                static int printed_error_message;
-                if (!printed_error_message) {
-                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
-                           sws_format_name(c->dstFormat));
-                    printed_error_message=1;
-                }
-                return;
-            }
-        }
-    }
-
-    if (i < dstW) {
-        i -= 16;
-
-        Y0 = RND;
-        Y1 = RND;
-        /* extract 16 coeffs from lumSrc */
-        for (j=0; j<lumFilterSize; j++) {
-            X0 = vec_ld (0,  &lumSrc[j][i]);
-            X1 = vec_ld (16, &lumSrc[j][i]);
-            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
-            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
-        }
-
-        U = RND;
-        V = RND;
-        /* extract 8 coeffs from U,V */
-        for (j=0; j<chrFilterSize; j++) {
-            X  = vec_ld (0, &chrSrc[j][i/2]);
-            U  = vec_mradds (X, CCoeffs[j], U);
-            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
-            V  = vec_mradds (X, CCoeffs[j], V);
-        }
-
-        /* scale and clip signals */
-        Y0 = vec_sra (Y0, SCL);
-        Y1 = vec_sra (Y1, SCL);
-        U  = vec_sra (U,  SCL);
-        V  = vec_sra (V,  SCL);
-
-        Y0 = vec_clip_s16 (Y0);
-        Y1 = vec_clip_s16 (Y1);
-        U  = vec_clip_s16 (U);
-        V  = vec_clip_s16 (V);
-
-        /* now we have
-           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
-           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
-
-           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
-           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
-           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
-        */
-
-        U0 = vec_mergeh (U,U);
-        V0 = vec_mergeh (V,V);
-
-        U1 = vec_mergel (U,U);
-        V1 = vec_mergel (V,V);
-
-        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
-        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
-
-        R  = vec_packclp (R0,R1);
-        G  = vec_packclp (G0,G1);
-        B  = vec_packclp (B0,B1);
-
-        nout = (vector unsigned char *)scratch;
-        switch(c->dstFormat) {
-            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
-            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
-            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
-            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
-            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
-            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
-            default:
-                /* Unreachable, I think. */
-                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
-                       sws_format_name(c->dstFormat));
-                return;
-        }
-
-        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
-    }
-
-}
diff --git a/libswscale/yuv2rgb_bfin.c b/libswscale/yuv2rgb_bfin.c
deleted file mode 100644
index 2cc6962d01..0000000000
--- a/libswscale/yuv2rgb_bfin.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
- *
- * Blackfin video color space converter operations
- * convert I420 YV12 to RGB in various formats
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <assert.h>
-#include "config.h"
-#include <unistd.h>
-#include "rgb2rgb.h"
-#include "swscale.h"
-#include "swscale_internal.h"
-
-#ifdef __FDPIC__
-#define L1CODE __attribute__ ((l1_text))
-#else
-#define L1CODE
-#endif
-
-void ff_bfin_yuv2rgb555_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                             int w, uint32_t *coeffs) L1CODE;
-
-void ff_bfin_yuv2rgb565_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                             int w, uint32_t *coeffs) L1CODE;
-
-void ff_bfin_yuv2rgb24_line(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                            int w, uint32_t *coeffs) L1CODE;
-
-typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                            int w, uint32_t *coeffs);
-
-
-static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
-{
-    int oy;
-    oy      = c->yOffset&0xffff;
-    oy      = oy >> 3; // keep everything U8.0 for offset calculation
-
-    c->oc   = 128*0x01010101U;
-    c->oy   =  oy*0x01010101U;
-
-    /* copy 64bit vector coeffs down to 32bit vector coeffs */
-    c->cy  = c->yCoeff;
-    c->zero = 0;
-
-    if (rgb) {
-        c->crv = c->vrCoeff;
-        c->cbu = c->ubCoeff;
-        c->cgu = c->ugCoeff;
-        c->cgv = c->vgCoeff;
-    } else {
-        c->crv = c->ubCoeff;
-        c->cbu = c->vrCoeff;
-        c->cgu = c->vgCoeff;
-        c->cgv = c->ugCoeff;
-    }
-
-
-    if (masks == 555) {
-        c->rmask = 0x001f * 0x00010001U;
-        c->gmask = 0x03e0 * 0x00010001U;
-        c->bmask = 0x7c00 * 0x00010001U;
-    } else if (masks == 565) {
-        c->rmask = 0x001f * 0x00010001U;
-        c->gmask = 0x07e0 * 0x00010001U;
-        c->bmask = 0xf800 * 0x00010001U;
-    }
-}
-
-static int core_yuv420_rgb(SwsContext *c,
-                           uint8_t **in, int *instrides,
-                           int srcSliceY, int srcSliceH,
-                           uint8_t **oplanes, int *outstrides,
-                           ltransform lcscf, int rgb, int masks)
-{
-    uint8_t *py,*pu,*pv,*op;
-    int w  = instrides[0];
-    int h2 = srcSliceH>>1;
-    int i;
-
-    bfin_prepare_coefficients(c, rgb, masks);
-
-    py = in[0];
-    pu = in[1+(1^rgb)];
-    pv = in[1+(0^rgb)];
-
-    op = oplanes[0] + srcSliceY*outstrides[0];
-
-    for (i=0;i<h2;i++) {
-
-        lcscf(py, pu, pv, op, w, &c->oy);
-
-        py += instrides[0];
-        op += outstrides[0];
-
-        lcscf(py, pu, pv, op, w, &c->oy);
-
-        py += instrides[0];
-        pu += instrides[1];
-        pv += instrides[2];
-        op += outstrides[0];
-    }
-
-    return srcSliceH;
-}
-
-
-static int bfin_yuv420_rgb555(SwsContext *c,
-                              uint8_t **in, int *instrides,
-                              int srcSliceY, int srcSliceH,
-                              uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
-}
-
-static int bfin_yuv420_bgr555(SwsContext *c,
-                              uint8_t **in, int *instrides,
-                              int srcSliceY, int srcSliceH,
-                              uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
-}
-
-static int bfin_yuv420_rgb24(SwsContext *c,
-                             uint8_t **in, int *instrides,
-                             int srcSliceY, int srcSliceH,
-                             uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
-}
-
-static int bfin_yuv420_bgr24(SwsContext *c,
-                             uint8_t **in, int *instrides,
-                             int srcSliceY, int srcSliceH,
-                             uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
-}
-
-static int bfin_yuv420_rgb565(SwsContext *c,
-                              uint8_t **in, int *instrides,
-                              int srcSliceY, int srcSliceH,
-                              uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
-}
-
-static int bfin_yuv420_bgr565(SwsContext *c,
-                              uint8_t **in, int *instrides,
-                              int srcSliceY, int srcSliceH,
-                              uint8_t **oplanes, int *outstrides)
-{
-    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
-                           outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
-}
-
-
-SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c)
-{
-    SwsFunc f;
-
-    switch(c->dstFormat) {
-    case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
-    case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
-    case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
-    case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
-    case PIX_FMT_RGB24:  f = bfin_yuv420_rgb24;  break;
-    case PIX_FMT_BGR24:  f = bfin_yuv420_bgr24;  break;
-    default:
-        return 0;
-    }
-
-    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
-           sws_format_name (c->dstFormat));
-
-    return f;
-}
diff --git a/libswscale/yuv2rgb_mlib.c b/libswscale/yuv2rgb_mlib.c
deleted file mode 100644
index c8966b9fd5..0000000000
--- a/libswscale/yuv2rgb_mlib.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * software YUV to RGB converter using mediaLib
- *
- * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <mlib_types.h>
-#include <mlib_status.h>
-#include <mlib_sys.h>
-#include <mlib_video.h>
-#include <inttypes.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include "swscale.h"
-
-static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                               int srcSliceH, uint8_t* dst[], int dstStride[]){
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
-
-    assert(srcStride[1] == srcStride[2]);
-
-    mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
-                               srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
-    return srcSliceH;
-}
-
-static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                               int srcSliceH, uint8_t* dst[], int dstStride[]){
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
-
-    assert(srcStride[1] == srcStride[2]);
-
-    mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
-                               srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
-    return srcSliceH;
-}
-
-static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                              int srcSliceH, uint8_t* dst[], int dstStride[]){
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
-
-    assert(srcStride[1] == srcStride[2]);
-
-    mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
-                              srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
-    return srcSliceH;
-}
-
-
-SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c)
-{
-    switch(c->dstFormat){
-    case PIX_FMT_RGB24: return mlib_YUV2RGB420_24;
-    case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32;
-    case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32;
-    default: return NULL;
-    }
-}
-
diff --git a/libswscale/yuv2rgb_template.c b/libswscale/yuv2rgb_template.c
deleted file mode 100644
index 798eff0e33..0000000000
--- a/libswscale/yuv2rgb_template.c
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
- *
- * Copyright (C) 2000, Silicon Integrated System Corp
- *
- * Author: Olie Lho <ollie@sis.com.tw>
- *
- * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
- * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
- * context / deglobalize stuff by Michael Niedermayer
- *
- * This file is part of mpeg2dec, a free MPEG-2 video decoder
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with mpeg2dec; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#undef MOVNTQ
-#undef EMMS
-#undef SFENCE
-
-#if HAVE_AMD3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-#define EMMS     "femms"
-#else
-#define EMMS     "emms"
-#endif
-
-#if HAVE_MMX2
-#define MOVNTQ "movntq"
-#define SFENCE "sfence"
-#else
-#define MOVNTQ "movq"
-#define SFENCE "/nop"
-#endif
-
-#define YUV2RGB \
-    /* Do the multiply part of the conversion for even and odd pixels,
-       register usage:
-       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
-       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
-       mm6 -> Y even, mm7 -> Y odd */\
-    /* convert the chroma part */\
-    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
-    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
-\
-    "psllw $3, %%mm0;" /* Promote precision */ \
-    "psllw $3, %%mm1;" /* Promote precision */ \
-\
-    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
-    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
-\
-    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
-    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
-\
-    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
-    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
-\
-    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
-    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
-\
-    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
-\
-    /* convert the luma part */\
-    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
-    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
-\
-    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
-\
-    "psllw $3, %%mm6;" /* Promote precision */\
-    "psllw $3, %%mm7;" /* Promote precision */\
-\
-    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
-    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
-\
-    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
-    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
-\
-    /* Do the addition part of the conversion for even and odd pixels,
-       register usage:
-       mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
-       mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
-       mm6 -> Y even, mm7 -> Y odd */\
-    "movq %%mm0, %%mm3;" /* Copy Cblue */\
-    "movq %%mm1, %%mm4;" /* Copy Cred */\
-    "movq %%mm2, %%mm5;" /* Copy Cgreen */\
-\
-    "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
-    "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
-\
-    "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
-    "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
-\
-    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
-    "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
-\
-    /* Limit RGB even to 0..255 */\
-    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0  B6 B4 B2 B0 */\
-    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0  R6 R4 R2 R0 */\
-    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0  G6 G4 G2 G0 */\
-\
-    /* Limit RGB odd to 0..255 */\
-    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1  B7 B5 B3 B1 */\
-    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1  R7 R5 R3 R1 */\
-    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1  G7 G5 G3 G1 */\
-\
-    /* Interleave RGB even and odd */\
-    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
-    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
-    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
-
-
-#define YUV422_UNSHIFT                   \
-    if(c->srcFormat == PIX_FMT_YUV422P){ \
-        srcStride[1] *= 2;               \
-        srcStride[2] *= 2;               \
-    }                                    \
-
-#define YUV2RGB_LOOP(depth)                                   \
-    h_size= (c->dstW+7)&~7;                                   \
-    if(h_size*depth > FFABS(dstStride[0])) h_size-=8;         \
-\
-    __asm__ volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );    \
-    for (y= 0; y<srcSliceH; y++ ) {                           \
-        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; \
-        uint8_t *py = src[0] + y*srcStride[0];                \
-        uint8_t *pu = src[1] + (y>>1)*srcStride[1];           \
-        uint8_t *pv = src[2] + (y>>1)*srcStride[2];           \
-        x86_reg index= -h_size/2;                                \
-
-#define YUV2RGB_INIT                                                       \
-        /* This MMX assembly code deals with a SINGLE scan line at a time, \
-         * it converts 8 pixels in each iteration. */                      \
-        __asm__ volatile (                                                 \
-        /* load data for start of next scan line */                        \
-        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
-        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
-        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
-        /*                                                                 \
-        ".balign 16     \n\t"                                              \
-        */                                                                 \
-        "1:             \n\t"                                              \
-        /* No speed difference on my p3@500 with prefetch,                 \
-         * if it is faster for anyone with -benchmark then tell me.        \
-        PREFETCH" 64(%0) \n\t"                                             \
-        PREFETCH" 64(%1) \n\t"                                             \
-        PREFETCH" 64(%2) \n\t"                                             \
-        */                                                                 \
-
-#define YUV2RGB_ENDLOOP(depth) \
-        "add $"AV_STRINGIFY(depth*8)", %1    \n\t" \
-        "add                       $4, %0    \n\t" \
-        " js                       1b        \n\t" \
-
-#define YUV2RGB_OPERANDS \
-        : "+r" (index), "+r" (image) \
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
-        ); \
-    } \
-    __asm__ volatile (EMMS); \
-    return srcSliceH; \
-
-#define YUV2RGB_OPERANDS_ALPHA \
-        : "+r" (index), "+r" (image) \
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index), "r" (pa - 2*index) \
-        ); \
-    } \
-    __asm__ volatile (EMMS); \
-    return srcSliceH; \
-
-static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
-    int y, h_size;
-
-    YUV422_UNSHIFT
-    YUV2RGB_LOOP(2)
-
-        c->blueDither= ff_dither8[y&1];
-        c->greenDither= ff_dither4[y&1];
-        c->redDither= ff_dither8[(y+1)&1];
-
-        YUV2RGB_INIT
-        YUV2RGB
-
-#ifdef DITHER1XBPP
-        "paddusb "BLUE_DITHER"(%4), %%mm0;"
-        "paddusb "GREEN_DITHER"(%4), %%mm2;"
-        "paddusb "RED_DITHER"(%4), %%mm1;"
-#endif
-        /* mask unneeded bits off */
-        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
-        "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
-        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
-
-        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
-        "pxor %%mm4, %%mm4;" /* zero mm4 */
-
-        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
-        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
-
-        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
-        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
-        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
-
-        "psllw  $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
-        "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
-
-        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        MOVNTQ "      %%mm0, (%1);" /* store pixel 0-3 */
-
-        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
-        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
-        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
-
-        "psllw        $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
-        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-
-        "por       %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
-        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-        MOVNTQ "   %%mm5, 8 (%1);" /* store pixel 4-7 */
-
-    YUV2RGB_ENDLOOP(2)
-    YUV2RGB_OPERANDS
-}
-
-static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
-    int y, h_size;
-
-    YUV422_UNSHIFT
-    YUV2RGB_LOOP(2)
-
-        c->blueDither= ff_dither8[y&1];
-        c->greenDither= ff_dither8[y&1];
-        c->redDither= ff_dither8[(y+1)&1];
-
-        YUV2RGB_INIT
-        YUV2RGB
-
-#ifdef DITHER1XBPP
-        "paddusb "BLUE_DITHER"(%4), %%mm0  \n\t"
-        "paddusb "GREEN_DITHER"(%4), %%mm2  \n\t"
-        "paddusb "RED_DITHER"(%4), %%mm1  \n\t"
-#endif
-
-        /* mask unneeded bits off */
-        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
-        "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
-        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
-
-        "psrlw   $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
-        "psrlw   $1, %%mm1;" /* 0_r7r6r5  r4r3_0_0 0_r7r6r5 r4r3_0_0 */
-        "pxor %%mm4, %%mm4;" /* zero mm4 */
-
-        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
-        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
-
-        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
-        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
-        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
-
-        "psllw  $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
-        "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
-
-        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        MOVNTQ "      %%mm0, (%1);"  /* store pixel 0-3 */
-
-        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
-        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
-        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
-
-        "psllw        $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
-        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-
-        "por       %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
-        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
-
-    YUV2RGB_ENDLOOP(2)
-    YUV2RGB_OPERANDS
-}
-
-static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
-    int y, h_size;
-
-    YUV422_UNSHIFT
-    YUV2RGB_LOOP(3)
-
-        YUV2RGB_INIT
-        YUV2RGB
-        /* mm0=B, %%mm2=G, %%mm1=R */
-#if HAVE_MMX2
-        "movq "MANGLE(ff_M24A)", %%mm4     \n\t"
-        "movq "MANGLE(ff_M24C)", %%mm7     \n\t"
-        "pshufw $0x50, %%mm0, %%mm5     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
-        "pshufw $0x50, %%mm2, %%mm3     \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */
-        "pshufw $0x00, %%mm1, %%mm6     \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */
-
-        "pand   %%mm4, %%mm5            \n\t" /*    B2        B1       B0 */
-        "pand   %%mm4, %%mm3            \n\t" /*    G2        G1       G0 */
-        "pand   %%mm7, %%mm6            \n\t" /*       R1        R0       */
-
-        "psllq     $8, %%mm3            \n\t" /* G2        G1       G0    */
-        "por    %%mm5, %%mm6            \n\t"
-        "por    %%mm3, %%mm6            \n\t"
-        MOVNTQ" %%mm6, (%1)             \n\t"
-
-        "psrlq     $8, %%mm2            \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
-        "pshufw $0xA5, %%mm0, %%mm5     \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
-        "pshufw $0x55, %%mm2, %%mm3     \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */
-        "pshufw $0xA5, %%mm1, %%mm6     \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */
-
-        "pand "MANGLE(ff_M24B)", %%mm5     \n\t" /* B5       B4        B3    */
-        "pand          %%mm7, %%mm3     \n\t" /*       G4        G3       */
-        "pand          %%mm4, %%mm6     \n\t" /*    R4        R3       R2 */
-
-        "por    %%mm5, %%mm3            \n\t" /* B5    G4 B4     G3 B3    */
-        "por    %%mm3, %%mm6            \n\t"
-        MOVNTQ" %%mm6, 8(%1)            \n\t"
-
-        "pshufw $0xFF, %%mm0, %%mm5     \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */
-        "pshufw $0xFA, %%mm2, %%mm3     \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
-        "pshufw $0xFA, %%mm1, %%mm6     \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
-        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-
-        "pand          %%mm7, %%mm5     \n\t" /*       B7        B6       */
-        "pand          %%mm4, %%mm3     \n\t" /*    G7        G6       G5 */
-        "pand "MANGLE(ff_M24B)", %%mm6     \n\t" /* R7       R6        R5    */
-        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-\
-        "por          %%mm5, %%mm3      \n\t"
-        "por          %%mm3, %%mm6      \n\t"
-        MOVNTQ"       %%mm6, 16(%1)     \n\t"
-        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        "pxor         %%mm4, %%mm4      \n\t"
-
-#else
-
-        "pxor      %%mm4, %%mm4     \n\t"
-        "movq      %%mm0, %%mm5     \n\t" /* B */
-        "movq      %%mm1, %%mm6     \n\t" /* R */
-        "punpcklbw %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */
-        "punpcklbw %%mm4, %%mm1     \n\t" /* 0R0R0R0R 0 */
-        "punpckhbw %%mm2, %%mm5     \n\t" /* GBGBGBGB 2 */
-        "punpckhbw %%mm4, %%mm6     \n\t" /* 0R0R0R0R 2 */
-        "movq      %%mm0, %%mm7     \n\t" /* GBGBGBGB 0 */
-        "movq      %%mm5, %%mm3     \n\t" /* GBGBGBGB 2 */
-        "punpcklwd %%mm1, %%mm7     \n\t" /* 0RGB0RGB 0 */
-        "punpckhwd %%mm1, %%mm0     \n\t" /* 0RGB0RGB 1 */
-        "punpcklwd %%mm6, %%mm5     \n\t" /* 0RGB0RGB 2 */
-        "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */
-
-        "movq      %%mm7, %%mm2     \n\t" /* 0RGB0RGB 0 */
-        "movq      %%mm0, %%mm6     \n\t" /* 0RGB0RGB 1 */
-        "movq      %%mm5, %%mm1     \n\t" /* 0RGB0RGB 2 */
-        "movq      %%mm3, %%mm4     \n\t" /* 0RGB0RGB 3 */
-
-        "psllq       $40, %%mm7     \n\t" /* RGB00000 0 */
-        "psllq       $40, %%mm0     \n\t" /* RGB00000 1 */
-        "psllq       $40, %%mm5     \n\t" /* RGB00000 2 */
-        "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */
-
-        "punpckhdq %%mm2, %%mm7     \n\t" /* 0RGBRGB0 0 */
-        "punpckhdq %%mm6, %%mm0     \n\t" /* 0RGBRGB0 1 */
-        "punpckhdq %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */
-        "punpckhdq %%mm4, %%mm3     \n\t" /* 0RGBRGB0 3 */
-
-        "psrlq        $8, %%mm7     \n\t" /* 00RGBRGB 0 */
-        "movq      %%mm0, %%mm6     \n\t" /* 0RGBRGB0 1 */
-        "psllq       $40, %%mm0     \n\t" /* GB000000 1 */
-        "por       %%mm0, %%mm7     \n\t" /* GBRGBRGB 0 */
-        MOVNTQ"    %%mm7, (%1)      \n\t"
-
-        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-
-        "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */
-        "movq      %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */
-        "psllq       $24, %%mm5     \n\t" /* BRGB0000 2 */
-        "por       %%mm5, %%mm6     \n\t" /* BRGBRGBR 1 */
-        MOVNTQ"    %%mm6, 8(%1)     \n\t"
-
-        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-        "psrlq       $40, %%mm1     \n\t" /* 000000RG 2 */
-        "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */
-        "por       %%mm3, %%mm1     \n\t" /* RGBRGBRG 2 */
-        MOVNTQ"    %%mm1, 16(%1)    \n\t"
-
-        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-        "pxor      %%mm4, %%mm4     \n\t"
-#endif
-
-    YUV2RGB_ENDLOOP(3)
-    YUV2RGB_OPERANDS
-}
-
-#define RGB_PLANAR2PACKED32                                             \
-    /* convert RGB plane to RGB packed format,                          \
-       mm0 ->  B, mm1 -> R, mm2 -> G, mm3 -> A,                         \
-       mm4 -> GB, mm5 -> AR pixel 4-7,                                  \
-       mm6 -> GB, mm7 -> AR pixel 0-3 */                                \
-    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
-    "movq      %%mm1, %%mm7;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
-\
-    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
-    "movq      %%mm1, %%mm5;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
-\
-    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
-    "punpcklbw %%mm3, %%mm7;"   /* A3 R3 A2 R2 A1 R1 A0 R0 */           \
-\
-    "punpcklwd %%mm7, %%mm6;"   /* A1 R1 B1 G1 A0 R0 B0 G0 */           \
-    MOVNTQ "   %%mm6, (%1);"    /* Store ARGB1 ARGB0 */                 \
-\
-    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
-    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
-\
-    "punpckhwd %%mm7, %%mm6;"   /* A3 R3 G3 B3 A2 R2 B3 G2 */           \
-    MOVNTQ "   %%mm6, 8 (%1);"  /* Store ARGB3 ARGB2 */                 \
-\
-    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
-    "punpckhbw %%mm3, %%mm5;"   /* A7 R7 A6 R6 A5 R5 A4 R4 */           \
-\
-    "punpcklwd %%mm5, %%mm4;"   /* A5 R5 B5 G5 A4 R4 B4 G4 */           \
-    MOVNTQ "   %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */                 \
-\
-    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
-    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
-\
-    "punpckhwd %%mm5, %%mm4;"   /* A7 R7 G7 B7 A6 R6 B6 G6 */           \
-    MOVNTQ "   %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */                 \
-\
-    "movd 4 (%2, %0), %%mm0;"   /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
-    "movd 4 (%3, %0), %%mm1;"   /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
-\
-    "pxor         %%mm4, %%mm4;" /* zero mm4 */                         \
-    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
-
-static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                                       int srcSliceH, uint8_t* dst[], int dstStride[]){
-    int y, h_size;
-
-    YUV422_UNSHIFT
-    YUV2RGB_LOOP(4)
-
-        YUV2RGB_INIT
-        YUV2RGB
-        "pcmpeqd   %%mm3, %%mm3;"   /* fill mm3 */
-        RGB_PLANAR2PACKED32
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS
-}
-
-static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
-#if HAVE_7REGS
-    int y, h_size;
-
-    YUV2RGB_LOOP(4)
-
-        uint8_t *pa = src[3] + y*srcStride[3];
-        YUV2RGB_INIT
-        YUV2RGB
-        "movq     (%6, %0, 2), %%mm3;"            /* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
-        RGB_PLANAR2PACKED32
-
-    YUV2RGB_ENDLOOP(4)
-    YUV2RGB_OPERANDS_ALPHA
-#endif
-}
diff --git a/libswscale/yuv2rgb_vis.c b/libswscale/yuv2rgb_vis.c
deleted file mode 100644
index fd5d0ba2f9..0000000000
--- a/libswscale/yuv2rgb_vis.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * VIS optimized software YUV to RGB converter
- * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <inttypes.h>
-#include <stdlib.h>
-
-#include "swscale.h"
-#include "swscale_internal.h"
-
-#define YUV2RGB_INIT \
-    "wr %%g0, 0x10, %%gsr \n\t" \
-    "ldd [%5], %%f32      \n\t" \
-    "ldd [%5+8], %%f34    \n\t" \
-    "ldd [%5+16], %%f36   \n\t" \
-    "ldd [%5+24], %%f38   \n\t" \
-    "ldd [%5+32], %%f40   \n\t" \
-    "ldd [%5+40], %%f42   \n\t" \
-    "ldd [%5+48], %%f44   \n\t" \
-    "ldd [%5+56], %%f46   \n\t" \
-    "ldd [%5+64], %%f48   \n\t" \
-    "ldd [%5+72], %%f50   \n\t"
-
-#define YUV2RGB_KERNEL \
-    /* ^^^^ f0=Y f3=u f5=v */ \
-    "fmul8x16 %%f3, %%f48, %%f6   \n\t" \
-    "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
-    "fmul8x16 %%f5, %%f44, %%f8   \n\t" \
-    "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
-    "fmul8x16 %%f0, %%f42, %%f0   \n\t" \
-    "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
-    "fmul8x16 %%f3, %%f50, %%f2   \n\t" \
-    "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
-    "fmul8x16 %%f5, %%f46, %%f4   \n\t" \
-    "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
-    \
-    "fpsub16 %%f6, %%f34, %%f6   \n\t" /* 1 */ \
-    "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
-    "fpsub16 %%f8, %%f38, %%f8   \n\t" /* 3 */ \
-    "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
-    "fpsub16 %%f0, %%f32, %%f0   \n\t" /* 0 */ \
-    "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
-    "fpsub16 %%f2, %%f36, %%f2   \n\t" /* 2 */ \
-    "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
-    "fpsub16 %%f4, %%f40, %%f4   \n\t" /* 4 */ \
-    "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
-    \
-    "fpadd16 %%f0, %%f8, %%f8    \n\t" /* Gt */ \
-    "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
-    "fpadd16 %%f0, %%f4, %%f4    \n\t" /* R */ \
-    "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
-    "fpadd16 %%f0, %%f6, %%f6    \n\t" /* B */ \
-    "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
-    "fpadd16 %%f8, %%f2, %%f2    \n\t" /* G */ \
-    "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
-    \
-    "fpack16 %%f4, %%f4    \n\t" \
-    "fpack16 %%f20, %%f20  \n\t" \
-    "fpack16 %%f6, %%f6    \n\t" \
-    "fpack16 %%f22, %%f22  \n\t" \
-    "fpack16 %%f2, %%f2    \n\t" \
-    "fpack16 %%f18, %%f18  \n\t"
-
-
-
-// FIXME: must be changed to set alpha to 255 instead of 0
-static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[]){
-  int y, out1, out2, out3, out4, out5, out6;
-
-  for(y=0;y < srcSliceH;++y) {
-      __asm__ volatile (
-          YUV2RGB_INIT
-          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
-          "1:                          \n\t"
-          "ldda [%1] %%asi, %%f2       \n\t"
-          "ldda [%1+2] %%asi, %%f18    \n\t"
-          "ldda [%2] %%asi, %%f4       \n\t"
-          "ldda [%2+2] %%asi, %%f20    \n\t"
-          "ld [%0], %%f0               \n\t"
-          "ld [%0+4], %%f16            \n\t"
-          "fpmerge %%f3, %%f3, %%f2    \n\t"
-          "fpmerge %%f19, %%f19, %%f18 \n\t"
-          "fpmerge %%f5, %%f5, %%f4    \n\t"
-          "fpmerge %%f21, %%f21, %%f20 \n\t"
-          YUV2RGB_KERNEL
-          "fzero %%f0                  \n\t"
-          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
-          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
-          "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
-          "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
-          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
-          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
-          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
-          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
-          "std %%f4, [%3]              \n\t"
-          "std %%f20, [%3+16]          \n\t"
-          "std %%f6, [%3+8]            \n\t"
-          "std %%f22, [%3+24]          \n\t"
-
-          "add %0, 8, %0   \n\t"
-          "add %1, 4, %1   \n\t"
-          "add %2, 4, %2   \n\t"
-          "subcc %4, 8, %4 \n\t"
-          "bne 1b          \n\t"
-          "add %3, 32, %3  \n\t" //delay slot
-          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
-          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]),
-            "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
-            "4" (c->dstW),
-            "5" (c->sparc_coeffs)
-      );
-  }
-
-  return srcSliceH;
-}
-
-// FIXME: must be changed to set alpha to 255 instead of 0
-static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[]){
-  int y, out1, out2, out3, out4, out5, out6;
-
-  for(y=0;y < srcSliceH;++y) {
-      __asm__ volatile (
-          YUV2RGB_INIT
-          "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
-          "1:                          \n\t"
-          "ldda [%1] %%asi, %%f2       \n\t"
-          "ldda [%1+2] %%asi, %%f18    \n\t"
-          "ldda [%2] %%asi, %%f4       \n\t"
-          "ldda [%2+2] %%asi, %%f20    \n\t"
-          "ld [%0], %%f0               \n\t"
-          "ld [%0+4], %%f16            \n\t"
-          "fpmerge %%f3, %%f3, %%f2    \n\t"
-          "fpmerge %%f19, %%f19, %%f18 \n\t"
-          "fpmerge %%f5, %%f5, %%f4    \n\t"
-          "fpmerge %%f21, %%f21, %%f20 \n\t"
-          YUV2RGB_KERNEL
-          "fzero %%f0 \n\t"
-          "fpmerge %%f4, %%f6, %%f8    \n\t"  // r,b,t1
-          "fpmerge %%f20, %%f22, %%f24 \n\t"  // r,b,t1
-          "fpmerge %%f0, %%f2, %%f10   \n\t"  // 0,g,t2
-          "fpmerge %%f0, %%f18, %%f26  \n\t"  // 0,g,t2
-          "fpmerge %%f10, %%f8, %%f4   \n\t"  // t2,t1,msb
-          "fpmerge %%f26, %%f24, %%f20 \n\t"  // t2,t1,msb
-          "fpmerge %%f11, %%f9, %%f6   \n\t"  // t2,t1,lsb
-          "fpmerge %%f27, %%f25, %%f22 \n\t"  // t2,t1,lsb
-          "std %%f4, [%3]              \n\t"
-          "std %%f20, [%3+16]          \n\t"
-          "std %%f6, [%3+8]            \n\t"
-          "std %%f22, [%3+24]          \n\t"
-
-          "add %0, 8, %0   \n\t"
-          "add %1, 4, %1   \n\t"
-          "add %2, 4, %2   \n\t"
-          "subcc %4, 8, %4 \n\t"
-          "bne 1b          \n\t"
-          "add %3, 32, %3  \n\t" //delay slot
-          : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
-          : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]),
-            "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
-            "4" (c->dstW),
-            "5" (c->sparc_coeffs)
-      );
-  }
-
-  return srcSliceH;
-}
-
-SwsFunc ff_yuv2rgb_init_vis(SwsContext *c){
-    c->sparc_coeffs[5]=c->yCoeff;
-    c->sparc_coeffs[6]=c->vgCoeff;
-    c->sparc_coeffs[7]=c->vrCoeff;
-    c->sparc_coeffs[8]=c->ubCoeff;
-    c->sparc_coeffs[9]=c->ugCoeff;
-
-    c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-    c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
-
-    if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
-        return vis_422P_ARGB32;
-    }
-    else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
-        return vis_420P_ARGB32;
-    }
-    return NULL;
-}
-- 
cgit v1.2.3