8 files changed, 617 insertions, 808 deletions
diff --git a/postproc/rgb2rgb.h b/postproc/rgb2rgb.h
index 1a7720c7aa..c74a197bfa 100644
--- a/postproc/rgb2rgb.h
+++ b/postproc/rgb2rgb.h
@@ -84,13 +84,15 @@ extern void yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t
 #define MODE_RGB  0x1
 #define MODE_BGR  0x2
 
-typedef void (* yuv2rgb_fun) (uint8_t * image, uint8_t * py,
+static void yuv2rgb(uint8_t * image, uint8_t * py, /* stub: ignores every argument and converts nothing */
 			      uint8_t * pu, uint8_t * pv,
 			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride);
+			      int rgb_stride, int y_stride, int uv_stride){
+printf("broken, this should use the swscaler\n"); /* only effect: report on stdout; NOTE(review): needs <stdio.h> in every includer - confirm */
+}
 
-extern yuv2rgb_fun yuv2rgb;
-
-void yuv2rgb_init (unsigned bpp, int mode);
+static void yuv2rgb_init (unsigned bpp, int mode){ /* stub: bpp and mode are ignored, nothing is initialised */
+printf("broken, this should use the swscaler\n"); /* only effect: report on stdout that this entry point no longer works */
+}
 
 #endif
diff --git a/postproc/swscale.c b/postproc/swscale.c
index b4febf948f..dbbf0726e8 100644
--- a/postproc/swscale.c
+++ b/postproc/swscale.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -62,6 +62,7 @@ untested special converters
 #include <stdlib.h>
 #endif
 #include "swscale.h"
+#include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../bswap.h"
 #include "../libvo/img_format.h"
@@ -147,7 +148,6 @@ add support for Y8 output
 optimize bgr24 & bgr32
 add BGR4 output support
 write special BGR->BGR scaler
-deglobalize yuv2rgb*.c
 */
 
 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -230,8 +230,6 @@ void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSli
              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
 
 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
-static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]);
-void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);
 
 extern const uint8_t dither_2x2_4[2][8];
 extern const uint8_t dither_2x2_8[2][8];
@@ -1634,18 +1632,6 @@ static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[],
 		interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
 }
 
-
-/* Warper functions for yuv2bgr */
-static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
-             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
-	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
-
-	if(c->srcFormat==IMGFMT_YV12)
-		yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
-	else /* I420 & IYUV */
-		yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
-}
-
 static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
 	uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
@@ -1773,7 +1759,7 @@ static void yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], in
 /**
  * bring pointers in YUV order instead of YVU
  */
-static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
+inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
 	if(format == IMGFMT_YV12 || format == IMGFMT_YVU9 
            || format == IMGFMT_444P || format == IMGFMT_422P || format == IMGFMT_411P){
 		sortedP[0]= p[0];
@@ -1814,8 +1800,8 @@ static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[],
 	uint8_t *src[3];
 	uint8_t *dst[3];
 
-	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
-	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
+	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 
 	if(isPacked(c->srcFormat))
 	{
@@ -1923,41 +1909,51 @@ static void getSubSampleFactors(int *h, int *v, int format){
 	}
 }
 
-static uint16_t roundToInt16(float f){
-	     if(f<-0x7FFF) f= -0x7FFF;
-	else if(f> 0x7FFF) f=  0x7FFF;
-	
-	return (int)floor(f + 0.5);
+static uint16_t roundToInt16(int64_t f){ /* f is 16.16 fixed point; returns it rounded to nearest and saturated, as a 16 bit pattern */
+	int64_t r= (f + (1<<15))>>16; /* stay 64 bit until after the clamp so huge inputs saturate instead of wrapping */
+	     if(r<-0x7FFF) return 0x8000; /* bit pattern of -32768 */
+	else if(r> 0x7FFF) return 0x7FFF;
+	else               return r;
 }
 
 /**
- * @param colorspace colorspace
+ * @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
  * @param fullRange if 1 then the luma range is 0..255 if 0 its 16..235
+ * @return -1 if not supported
  */
-void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, float brightness, float contrast, float saturation){
-
-	float crv =  Inverse_Table_6_9[colorspace][0]/65536.0;
-	float cbu =  Inverse_Table_6_9[colorspace][1]/65536.0;
-	float cgu = -Inverse_Table_6_9[colorspace][2]/65536.0;
-	float cgv = -Inverse_Table_6_9[colorspace][3]/65536.0;
-	float cy  = 1.0;
-	float oy  = 0;
+int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){ /* contrast/saturation: 16.16 fixed point, 1<<16 is neutral */
+	int64_t crv =  inv_table[0];
+	int64_t cbu =  inv_table[1];
+	int64_t cgu = -inv_table[2];
+	int64_t cgv = -inv_table[3];
+	int64_t cy  = 1<<16;
+	int64_t oy  = 0;
+
+	if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1; /* nothing is stored in c when unsupported */
+	memmove(c->srcColorspaceTable, inv_table, sizeof(int)*4); /* memmove, not memcpy: callers may pass back the pointers returned by sws_getColorspaceDetails */
+	memmove(c->dstColorspaceTable,     table, sizeof(int)*4);
+
+	c->brightness= brightness;
+	c->contrast  = contrast;
+	c->saturation= saturation;
+	c->srcRange  = srcRange;
+	c->dstRange  = dstRange;
 
 	c->uOffset=   0x0400040004000400LL;
 	c->vOffset=   0x0400040004000400LL;
 
-	if(!fullRange){
-		cy= (cy*255.0) / 219.0;
-		oy= 16.0;
+	if(!srcRange){ /* limited range source: stretch luma by 255/219 and remove the offset of 16 */
+		cy= (cy*255) / 219;
+		oy= 16<<16;
 	}
 
-	cy *= contrast;
-	crv*= contrast * saturation;
-	cbu*= contrast * saturation;
-	cgu*= contrast * saturation;
-	cgv*= contrast * saturation;
+	cy = (cy *contrast             )>>16; /* one >>16 per 16.16 factor multiplied in */
+	crv= (crv*contrast * saturation)>>32; /* NOTE(review): 64 bit product can overflow for very large contrast*saturation - confirm accepted range */
+	cbu= (cbu*contrast * saturation)>>32;
+	cgu= (cgu*contrast * saturation)>>32;
+	cgv= (cgv*contrast * saturation)>>32;
 
-	oy -= 256.0*brightness;
+	oy -= 256*brightness;
 
 	c->yCoeff=    roundToInt16(cy *8192) * 0x0001000100010001ULL;
 	c->vrCoeff=   roundToInt16(crv*8192) * 0x0001000100010001ULL;
@@ -1965,6 +1961,28 @@ void setInputColorspaceDetails(SwsContext *c, int colorspace, int fullRange, flo
 	c->vgCoeff=   roundToInt16(cgv*8192) * 0x0001000100010001ULL;
 	c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
 	c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;
+
+	yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation); /* return value is not checked here */
+	//FIXME factorize
+	
+	return 0;
+}
+
+/**
+ * @return -1 if not supported (destination format is YUV or gray, outputs untouched), 0 otherwise; *inv_table and *table then point into c, they are not copies
+ */
+int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
+	if(isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1; /* same format test as sws_setColorspaceDetails */
+
+	*inv_table = c->srcColorspaceTable; /* addresses of the arrays embedded in the context */
+	*table     = c->dstColorspaceTable;
+	*srcRange  = c->srcRange;
+	*dstRange  = c->dstRange;
+	*brightness= c->brightness;
+	*contrast  = c->contrast;
+	*saturation= c->saturation;
+	
+	return 0;	
 }
 
 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
@@ -2026,8 +2044,6 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 	c->dstFormat= dstFormat;
 	c->srcFormat= srcFormat;
 
-	setInputColorspaceDetails(c, SWS_CS_DEFAULT, 0, 0.0, 1.0, 1.0);
-	
 	usesFilter=0;
 	if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
 	if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
@@ -2054,17 +2070,14 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 
 	c->chrIntHSubSample= c->chrDstHSubSample;
 	c->chrIntVSubSample= c->chrSrcVSubSample;
-	
+
 	// note the -((-x)>>y) is so that we allways round toward +inf
 	c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
 	c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
 	c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
 	c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
-	
-	if(isBGR(dstFormat))
-		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_RGB, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
-	if(isRGB(dstFormat))
-		c->yuvTable= yuv2rgb_c_init(dstFormat & 0xFF, MODE_BGR, c->table_rV, c->table_gU, c->table_gV, c->table_bU);
+
+	sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, 0, 0, 1<<16, 1<<16); 
 
 	/* unscaled special Cases */
 	if(unscaled && !usesFilter)
@@ -2075,19 +2088,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			c->swScale= PlanarToNV12Wrapper;
 		}
 		/* yuv2bgr */
-		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
+		if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420 || srcFormat==IMGFMT_422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
 		{
-			// FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
-			//FIXME rgb vs. bgr ? 
-#ifdef WORDS_BIGENDIAN
-			if(dstFormat==IMGFMT_BGR32)
-				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
-			else
-				yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
-#else
-			yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
-#endif
-			c->swScale= planarYuvToBgr;
+			c->swScale= yuv2rgb_get_func_ptr(c);
 		}
 		
 		if( srcFormat==IMGFMT_YVU9 && (dstFormat==IMGFMT_YV12 || dstFormat==IMGFMT_I420) )
diff --git a/postproc/swscale.h b/postproc/swscale.h
index fd1539b1e6..0471d6a998 100644
--- a/postproc/swscale.h
+++ b/postproc/swscale.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,6 +16,11 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
+#ifndef SWSCALE_H
+#define SWSCALE_H
+
+#include "swscale_internal.h" //FIXME HACK REMOVE
+
 /* values for the flags, the stuff on the command line is different */
 #define SWS_FAST_BILINEAR 1
 #define SWS_BILINEAR 2
@@ -44,8 +49,6 @@
 #define SWS_FULL_CHR_H_INP	0x4000
 #define SWS_DIRECT_BGR		0x8000
 
-#define MAX_FILTER_SIZE 256
-
 #define SWS_MAX_REDUCE_CUTOFF 0.002
 
 #define SWS_CS_ITU709		1
@@ -56,97 +59,6 @@
 #define SWS_CS_SMPTE240M 	7
 #define SWS_CS_DEFAULT 		5
 
-/* this struct should be aligned on at least 32-byte boundary */
-typedef struct SwsContext{
-	int srcW, srcH, dstH;
-	int chrSrcW, chrSrcH, chrDstW, chrDstH;
-	int lumXInc, chrXInc;
-	int lumYInc, chrYInc;
-	int dstFormat, srcFormat;
-	int chrSrcHSubSample, chrSrcVSubSample;
-	int chrIntHSubSample, chrIntVSubSample;
-	int chrDstHSubSample, chrDstVSubSample;
-	int vChrDrop;
-
-	int16_t **lumPixBuf;
-	int16_t **chrPixBuf;
-	int16_t *hLumFilter;
-	int16_t *hLumFilterPos;
-	int16_t *hChrFilter;
-	int16_t *hChrFilterPos;
-	int16_t *vLumFilter;
-	int16_t *vLumFilterPos;
-	int16_t *vChrFilter;
-	int16_t *vChrFilterPos;
-
-	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change alot of code for this to be usefull
-
-	int hLumFilterSize;
-	int hChrFilterSize;
-	int vLumFilterSize;
-	int vChrFilterSize;
-	int vLumBufSize;
-	int vChrBufSize;
-
-	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
-	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
-	int32_t *lumMmx2FilterPos;
-	int32_t *chrMmx2FilterPos;
-	int16_t *lumMmx2Filter;
-	int16_t *chrMmx2Filter;
-
-	int canMMX2BeUsed;
-
-	int lastInLumBuf;
-	int lastInChrBuf;
-	int lumBufIndex;
-	int chrBufIndex;
-	int dstY;
-	int flags;
-	void * yuvTable;
-	void * table_rV[256];
-	void * table_gU[256];
-	int    table_gV[256];
-	void * table_bU[256];
-
-	void (*swScale)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
-             int srcSliceH, uint8_t* dst[], int dstStride[]);
-
-#define RED_DITHER   "0*8"
-#define GREEN_DITHER "1*8"
-#define BLUE_DITHER  "2*8"
-#define Y_COEFF      "3*8"
-#define VR_COEFF     "4*8"
-#define UB_COEFF     "5*8"
-#define VG_COEFF     "6*8"
-#define UG_COEFF     "7*8"
-#define Y_OFFSET     "8*8"
-#define U_OFFSET     "9*8"
-#define V_OFFSET     "10*8"
-#define LUM_MMX_FILTER_OFFSET "11*8"
-#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
-#define DSTW_OFFSET  "11*8+4*4*256*2"
-#define ESP_OFFSET  "11*8+4*4*256*2+4"
-                  
-	uint64_t redDither   __attribute__((aligned(8)));
-	uint64_t greenDither __attribute__((aligned(8)));
-	uint64_t blueDither  __attribute__((aligned(8)));
-
-	uint64_t yCoeff      __attribute__((aligned(8)));
-	uint64_t vrCoeff     __attribute__((aligned(8)));
-	uint64_t ubCoeff     __attribute__((aligned(8)));
-	uint64_t vgCoeff     __attribute__((aligned(8)));
-	uint64_t ugCoeff     __attribute__((aligned(8)));
-	uint64_t yOffset     __attribute__((aligned(8)));
-	uint64_t uOffset     __attribute__((aligned(8)));
-	uint64_t vOffset     __attribute__((aligned(8)));
-	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
-	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
-	int dstW;
-	int esp;
-} SwsContext;
-//FIXME check init (where 0)
-//FIXME split private & public
 
 
 // when used for filters they must have an odd number of elements
@@ -185,6 +97,9 @@ SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH,
 			 SwsFilter *srcFilter, SwsFilter *dstFilter);
 void swsGetFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam);
 
+int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
+int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
+
 SwsVector *getGaussianVec(double variance, double quality);
 SwsVector *getConstVec(double c, int length);
 SwsVector *getIdentityVec(void);
@@ -199,3 +114,4 @@ SwsVector *cloneVec(SwsVector *a);
 void printVec(SwsVector *a);
 void freeVec(SwsVector *a);
 
+#endif
diff --git a/postproc/swscale_internal.h b/postproc/swscale_internal.h
new file mode 100644
index 0000000000..0697d97e62
--- /dev/null
+++ b/postproc/swscale_internal.h
@@ -0,0 +1,130 @@
+/*
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#ifndef SWSCALE_INTERNAL_H
+#define SWSCALE_INTERNAL_H
+
+#define MAX_FILTER_SIZE 256
+
+struct SwsContext;
+
+typedef void (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]); /* type of SwsContext.swScale; the yuv2rgb converters write srcSliceH rows starting at destination row srcSliceY */
+
+/* Scaler/converter state passed to every SwsFunc. This struct should be aligned on at least 32-byte boundary */
+typedef struct SwsContext{
+	int srcW, srcH, dstH; // dstW is declared near the end of the struct, after chrMmxFilter
+	int chrSrcW, chrSrcH, chrDstW, chrDstH;
+	int lumXInc, chrXInc;
+	int lumYInc, chrYInc;
+	int dstFormat, srcFormat;
+	int chrSrcHSubSample, chrSrcVSubSample;
+	int chrIntHSubSample, chrIntVSubSample;
+	int chrDstHSubSample, chrDstVSubSample;
+	int vChrDrop;
+
+	int16_t **lumPixBuf;
+	int16_t **chrPixBuf;
+	int16_t *hLumFilter;
+	int16_t *hLumFilterPos;
+	int16_t *hChrFilter;
+	int16_t *hChrFilterPos;
+	int16_t *vLumFilter;
+	int16_t *vLumFilterPos;
+	int16_t *vChrFilter;
+	int16_t *vChrFilterPos;
+
+	uint8_t formatConvBuffer[4000]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful
+
+	int hLumFilterSize;
+	int hChrFilterSize;
+	int vLumFilterSize;
+	int vChrFilterSize;
+	int vLumBufSize;
+	int vChrBufSize;
+
+	uint8_t __attribute__((aligned(32))) funnyYCode[10000];
+	uint8_t __attribute__((aligned(32))) funnyUVCode[10000];
+	int32_t *lumMmx2FilterPos;
+	int32_t *chrMmx2FilterPos;
+	int16_t *lumMmx2Filter;
+	int16_t *chrMmx2Filter;
+
+	int canMMX2BeUsed;
+
+	int lastInLumBuf;
+	int lastInChrBuf;
+	int lumBufIndex;
+	int chrBufIndex;
+	int dstY;
+	int flags;
+	void * yuvTable;			// pointer to the yuv->rgb table start so it can be freed()
+	void * table_rV[256];			// red lookup, indexed by the V sample (see RGB() in yuv2rgb.c)
+	void * table_gU[256];			// green lookup base, indexed by the U sample
+	int    table_gV[256];			// added to table_gU[U], indexed by the V sample
+	void * table_bU[256];			// blue lookup, indexed by the U sample
+
+	//Colorspace stuff
+	int contrast, brightness, saturation;	// for sws_getColorspaceDetails; stored by sws_setColorspaceDetails
+	int srcColorspaceTable[4];		// copy of the inv_table argument of sws_setColorspaceDetails
+	int dstColorspaceTable[4];		// copy of the table argument of sws_setColorspaceDetails
+	int srcRange, dstRange;
+
+	SwsFunc swScale;
+
+#define RED_DITHER   "0*8" /* these strings are the byte offsets of the fields below, counted from redDither; keep them in sync with the field order */
+#define GREEN_DITHER "1*8"
+#define BLUE_DITHER  "2*8"
+#define Y_COEFF      "3*8"
+#define VR_COEFF     "4*8"
+#define UB_COEFF     "5*8"
+#define VG_COEFF     "6*8"
+#define UG_COEFF     "7*8"
+#define Y_OFFSET     "8*8"
+#define U_OFFSET     "9*8"
+#define V_OFFSET     "10*8"
+#define LUM_MMX_FILTER_OFFSET "11*8"
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" /* 4*4*256 == sizeof(int32_t)*4*MAX_FILTER_SIZE, the size of lumMmxFilter */
+#define DSTW_OFFSET  "11*8+4*4*256*2"
+#define ESP_OFFSET  "11*8+4*4*256*2+4"
+                  
+	uint64_t redDither   __attribute__((aligned(8)));
+	uint64_t greenDither __attribute__((aligned(8)));
+	uint64_t blueDither  __attribute__((aligned(8)));
+
+	uint64_t yCoeff      __attribute__((aligned(8)));
+	uint64_t vrCoeff     __attribute__((aligned(8)));
+	uint64_t ubCoeff     __attribute__((aligned(8)));
+	uint64_t vgCoeff     __attribute__((aligned(8)));
+	uint64_t ugCoeff     __attribute__((aligned(8)));
+	uint64_t yOffset     __attribute__((aligned(8)));
+	uint64_t uOffset     __attribute__((aligned(8)));
+	uint64_t vOffset     __attribute__((aligned(8)));
+	int32_t  lumMmxFilter[4*MAX_FILTER_SIZE];
+	int32_t  chrMmxFilter[4*MAX_FILTER_SIZE];
+	int dstW;
+	int esp;
+} SwsContext;
+//FIXME check init (where 0)
+//FIXME split private & public
+
+void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]); /* defined in swscale.c; declared without 'inline' so that definition is an external one under C99 inline rules and other files can link to it */
+SwsFunc yuv2rgb_get_func_ptr (SwsContext *c); /* result is stored in c->swScale by getSwsContext for the unscaled yuv->rgb/bgr case */
+int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation); /* called from sws_setColorspaceDetails */
+
+#endif
diff --git a/postproc/swscale_template.c b/postproc/swscale_template.c
index b108fb1529..4166e50203 100644
--- a/postproc/swscale_template.c
+++ b/postproc/swscale_template.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
+    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -741,7 +741,6 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t *
 				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
 {
-	int dummy=0;
 #ifdef HAVE_MMX
 	if(uDest != NULL)
 	{
@@ -2553,8 +2552,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 	uint8_t *src[3];
 	uint8_t *dst[3];
 	
-	orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
-	orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
+	sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+	sws_orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
 
 	if(isPacked(c->srcFormat)){
 		src[0]=
diff --git a/postproc/yuv2rgb.c b/postproc/yuv2rgb.c
index 658d8a37db..94196c4c2d 100644
--- a/postproc/yuv2rgb.c
+++ b/postproc/yuv2rgb.c
@@ -27,18 +27,23 @@
  *
  * MMX/MMX2 Template stuff from Michael Niedermayer (michaelni@gmx.at) (needed for fast movntq support)
  * 1,4,8bpp support by Michael Niedermayer (michaelni@gmx.at)
+ * context / deglobalize stuff by Michael Niedermayer
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h>
+#include <assert.h>
 
 #include "config.h"
 //#include "video_out.h"
 #include "rgb2rgb.h"
+#include "swscale.h"
+#include "swscale_internal.h"
 #include "../cpudetect.h"
 #include "../mangle.h"
 #include "../mp_msg.h"
+#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
 
 #ifdef HAVE_MLIB
 #include "yuv2rgb_mlib.c"
@@ -46,10 +51,6 @@
 
 #define DITHER1XBPP // only for mmx
 
-#ifdef ARCH_X86
-#define CAN_COMPILE_X86_ASM
-#endif
-
 const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
 {  1,   3,   1,   3,   1,   3,   1,   3, },
 {  2,   0,   2,   0,   2,   0,   2,   0, },
@@ -157,21 +158,10 @@ const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
 };
 #endif
 
-#ifdef CAN_COMPILE_X86_ASM
+#ifdef ARCH_X86
 
 /* hope these constant values are cache line aligned */
-uint64_t __attribute__((aligned(8))) mmx_80w = 0x0080008000800080;
-uint64_t __attribute__((aligned(8))) mmx_10w = 0x1010101010101010;
 uint64_t __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ff;
-uint64_t __attribute__((aligned(8))) mmx_Y_coeff = 0x253f253f253f253f;
-
-/* hope these constant values are cache line aligned */
-uint64_t __attribute__((aligned(8))) mmx_U_green = 0xf37df37df37df37d;
-uint64_t __attribute__((aligned(8))) mmx_U_blue = 0x4093409340934093;
-uint64_t __attribute__((aligned(8))) mmx_V_red = 0x3312331233123312;
-uint64_t __attribute__((aligned(8))) mmx_V_green = 0xe5fce5fce5fce5fc;
-
-/* hope these constant values are cache line aligned */
 uint64_t __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8;
 uint64_t __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfc;
 
@@ -217,8 +207,6 @@ uint64_t __attribute__((aligned(8))) dither8[2]={
 
 #endif // CAN_COMPILE_X86_ASM
 
-uint32_t matrix_coefficients = 6;
-
 const int32_t Inverse_Table_6_9[8][4] = {
     {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
     {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
@@ -230,82 +218,12 @@ const int32_t Inverse_Table_6_9[8][4] = {
     {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };
 
-void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256]);
-
-yuv2rgb_fun yuv2rgb= NULL;
-
-static void (* yuv2rgb_c_internal) (uint8_t *, uint8_t *,
-				    uint8_t *, uint8_t *,
-				    void *, void *, int, int);
-
-static void yuv2rgb_c (void * dst, uint8_t * py,
-		       uint8_t * pu, uint8_t * pv,
-		       unsigned h_size, unsigned v_size,
-		       unsigned rgb_stride, unsigned y_stride, unsigned uv_stride)
-{
-    v_size >>= 1;
-
-    while (v_size--) {
-	yuv2rgb_c_internal (py, py + y_stride, pu, pv, dst, dst + rgb_stride,
-			    h_size, v_size<<1);
-
-	py += 2 * y_stride;
-	pu += uv_stride;
-	pv += uv_stride;
-	dst += 2 * rgb_stride;
-    }
-}
-
-void * table_rV[256];
-void * table_gU[256];
-int table_gV[256];
-void * table_bU[256];
-
-void yuv2rgb_init (unsigned bpp, int mode)
-{
-    if(yuv2rgb) return;
-#ifdef CAN_COMPILE_X86_ASM
-    if(gCpuCaps.hasMMX2)
-    {
-	if (yuv2rgb == NULL /*&& (config.flags & VO_MMX_ENABLE)*/) {
-		yuv2rgb = yuv2rgb_init_MMX2 (bpp, mode);
-		if (yuv2rgb != NULL)
-			mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX2 for colorspace transform\n");
-		else
-			mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX2 colorspace transform\n");
-	}
-    }
-    else if(gCpuCaps.hasMMX)
-    {
-	if (yuv2rgb == NULL /*&& (config.flags & VO_MMX_ENABLE)*/) {
-		yuv2rgb = yuv2rgb_init_MMX (bpp, mode);
-		if (yuv2rgb != NULL)
-			mp_msg(MSGT_SWS,MSGL_INFO,"Using MMX for colorspace transform\n");
-		else
-			mp_msg(MSGT_SWS,MSGL_WARN,"Cannot init MMX colorspace transform\n");
-	}
-    }
-#endif
-#ifdef HAVE_MLIB
-    if (yuv2rgb == NULL /*&& (config.flags & VO_MLIB_ENABLE)*/) {
-	yuv2rgb = yuv2rgb_init_mlib (bpp, mode);
-	if (yuv2rgb != NULL)
-	    mp_msg(MSGT_SWS,MSGL_INFO,"Using mlib for colorspace transform\n");
-    }
-#endif
-    if (yuv2rgb == NULL) {
-	mp_msg(MSGT_SWS,MSGL_INFO,"No accelerated colorspace conversion found\n");
-	yuv2rgb_c_init (bpp, mode, table_rV, table_gU, table_gV, table_bU);
-	yuv2rgb = (yuv2rgb_fun)yuv2rgb_c;
-    }
-}
-
 #define RGB(i)					\
 	U = pu[i];				\
 	V = pv[i];				\
-	r = table_rV[V];			\
-	g = table_gU[U] + table_gV[V];		\
-	b = table_bU[U];
+	r = c->table_rV[V];			\
+	g = c->table_gU[U] + c->table_gV[V];		\
+	b = c->table_bU[U];
 
 #define DST1(i)					\
 	Y = py_1[2*i];				\
@@ -343,19 +261,42 @@ void yuv2rgb_init (unsigned bpp, int mode)
 	Y = py_2[2*i+1];						\
 	dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
 
-static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint32_t * r, * g, * b;
-    uint32_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
+#define PROLOG(func_name, dst_type) /* opens a SwsFunc-shaped converter: function, row-pair loop and 8-pixel column loop; EPILOG closes all three */ \
+static void func_name(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, \
+             int srcSliceH, uint8_t* dst[], int dstStride[]){\
+    uint8_t *src[3];\
+    int srcStride[3];\
+    int y;\
+\
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);\
+    if(c->srcFormat == IMGFMT_422P){ /* chroma is indexed with y>>1 below, so doubling the stride makes it read chroma row y; presumably 422P has one chroma row per luma row - confirm */ \
+	srcStride[1] *= 2;\
+	srcStride[2] *= 2;\
+    }\
+    for(y=0; y<srcSliceH; y+=2){ /* two rows per pass; NOTE(review): an odd srcSliceH makes the last pass touch row srcSliceH - confirm callers only pass even heights */ \
+	dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
+	dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
+	dst_type *r, *g, *b;\
+	uint8_t *py_1= src[0] + y*srcStride[0];\
+	uint8_t *py_2= py_1 + srcStride[0];\
+	uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
+	uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
+	unsigned int h_size= c->dstW>>3; /* whole groups of 8 pixels only; a remainder of dstW%8 columns is not converted */ \
+	while (h_size--) {\
+	    int U, V, Y;\
+
+#define EPILOG(dst_delta) /* dst_delta: how many dst_type elements 8 output pixels occupy */ \
+	    pu += 4;\
+	    pv += 4;\
+	    py_1 += 8;\
+	    py_2 += 8;\
+	    dst_1 += dst_delta;\
+	    dst_2 += dst_delta;\
+	}\
+    }\
+}
 
-    while (h_size--) {
+PROLOG(yuv2rgb_c_32, uint32_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -371,30 +312,9 @@ static void yuv2rgb_c_32 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
+EPILOG(8)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
-
-// This is very near from the yuv2rgb_c_32 code
-static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
-			      uint8_t * pu, uint8_t * pv,
-			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_24_rgb, uint8_t)
 	RGB(0);
 	DST1RGB(0);
 	DST2RGB(0);
@@ -410,30 +330,10 @@ static void yuv2rgb_c_24_rgb (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2RGB(3);
 	DST1RGB(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 24;
-	dst_2 += 24;
-    }
-}
+EPILOG(24)
 
 // only trivial mods from yuv2rgb_c_24_rgb
-static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
-			      uint8_t * pu, uint8_t * pv,
-			      void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_24_bgr, uint8_t)
 	RGB(0);
 	DST1BGR(0);
 	DST2BGR(0);
@@ -449,31 +349,11 @@ static void yuv2rgb_c_24_bgr (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2BGR(3);
 	DST1BGR(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 24;
-	dst_2 += 24;
-    }
-}
+EPILOG(24)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint16_t * r, * g, * b;
-    uint16_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_16, uint16_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -489,31 +369,11 @@ static void yuv2rgb_c_16 (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_8, uint8_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -529,32 +389,12 @@ static void yuv2rgb_c_8  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
-	const uint8_t *d32= dither_8x8_32[v_pos&7];
-	const uint8_t *d64= dither_8x8_73[v_pos&7];
+PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
+	const uint8_t *d32= dither_8x8_32[y&7];
+	const uint8_t *d64= dither_8x8_73[y&7];
 #define DST1bpp8(i,o)					\
 	Y = py_1[2*i];				\
 	dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]];	\
@@ -583,32 +423,12 @@ static void yuv2rgb_c_8_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp8(3,6);
 	DST1bpp8(3,6);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
+EPILOG(8)
 
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_4, uint8_t)
         int acc;
 #define DST1_4(i)					\
 	Y = py_1[2*i];				\
@@ -639,31 +459,11 @@ static void yuv2rgb_c_4  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2_4(3);
 	DST1_4(3);
+EPILOG(4)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 4;
-	dst_2 += 4;
-    }
-}
-
-static void yuv2rgb_c_4_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
-	const uint8_t *d64= dither_8x8_73[v_pos&7];
-	const uint8_t *d128=dither_8x8_220[v_pos&7];
+PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
+	const uint8_t *d64= dither_8x8_73[y&7];
+	const uint8_t *d128=dither_8x8_220[y&7];
         int acc;
 
 #define DST1bpp4(i,o)					\
@@ -696,31 +496,11 @@ static void yuv2rgb_c_4_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp4(3,6);
 	DST1bpp4(3,6);
-
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 4;
-	dst_2 += 4;
-    }
-}
+EPILOG(4)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-static void yuv2rgb_c_4b  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
+PROLOG(yuv2rgb_c_4b, uint8_t)
 	RGB(0);
 	DST1(0);
 	DST2(0);
@@ -736,31 +516,11 @@ static void yuv2rgb_c_4b  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2(3);
 	DST1(3);
+EPILOG(8)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
-
-static void yuv2rgb_c_4b_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int U, V, Y;
-    uint8_t * r, * g, * b;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-
-    while (h_size--) {
-	const uint8_t *d64= dither_8x8_73[v_pos&7];
-	const uint8_t *d128=dither_8x8_220[v_pos&7];
+PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
+	const uint8_t *d64= dither_8x8_73[y&7];
+	const uint8_t *d128=dither_8x8_220[y&7];
 
 #define DST1bpp4b(i,o)					\
 	Y = py_1[2*i];				\
@@ -790,31 +550,11 @@ static void yuv2rgb_c_4b_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	RGB(3);
 	DST2bpp4b(3,6);
 	DST1bpp4b(3,6);
+EPILOG(8)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 += 8;
-	dst_2 += 8;
-    }
-}
-
-static void yuv2rgb_c_1_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
-			  uint8_t * pu, uint8_t * pv,
-			  void * _dst_1, void * _dst_2, int h_size, int v_pos)
-{
-    int Y;
-    uint8_t * g;
-    uint8_t * dst_1, * dst_2;
-
-    h_size >>= 3;
-    dst_1 = _dst_1;
-    dst_2 = _dst_2;
-    g= table_gU[128] + table_gV[128];
-
-    while (h_size--) {
-	const uint8_t *d128=dither_8x8_220[v_pos&7];
+PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
+	const uint8_t *d128=dither_8x8_220[y&7];
 	char out_1=0, out_2=0;
+	g= c->table_gU[128] + c->table_gV[128]; /* kept after the declarations: C89 forbids a statement ahead of declarations in a block */
 
 #define DST1bpp1(i,o)					\
@@ -843,17 +583,59 @@ static void yuv2rgb_c_1_ordered_dither  (uint8_t * py_1, uint8_t * py_2,
 	
 	dst_1[0]= out_1;
 	dst_2[0]= out_2;
+EPILOG(1)
 
-	pu += 4;
-	pv += 4;
-	py_1 += 8;
-	py_2 += 8;
-	dst_1 ++;
-	dst_2 ++;
+SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) /* picks the YUV->RGB converter for c->dstFormat: MMX2, then MMX, then mlib, then plain C */
+{
+#ifdef ARCH_X86
+    if(gCpuCaps.hasMMX2){
+	switch(c->dstFormat){ /* no default: formats not listed here fall through to the next stage */
+	case IMGFMT_BGR32: return yuv420_rgb32_MMX2;
+	case IMGFMT_BGR24: return yuv420_rgb24_MMX2;
+	case IMGFMT_BGR16: return yuv420_rgb16_MMX2;
+	case IMGFMT_BGR15: return yuv420_rgb15_MMX2;
+	}
     }
+    if(gCpuCaps.hasMMX){
+	switch(c->dstFormat){ /* same four formats as the MMX2 stage */
+	case IMGFMT_BGR32: return yuv420_rgb32_MMX;
+	case IMGFMT_BGR24: return yuv420_rgb24_MMX;
+	case IMGFMT_BGR16: return yuv420_rgb16_MMX;
+	case IMGFMT_BGR15: return yuv420_rgb15_MMX;
+	}
+    }
+#endif
+#ifdef HAVE_MLIB
+    {
+	SwsFunc t= yuv2rgb_init_mlib(c); /* NULL means mlib has no converter for this format */
+	if(t) return t;
+    }
+#endif
+    mp_msg(MSGT_SWS,MSGL_WARN,"No accelerated colorspace conversion found\n"); /* logged whenever the C fallback below is reached */
+
+    switch(c->dstFormat){ /* C fallbacks: RGB and BGR variants share one routine except at 24 bpp */
+    case IMGFMT_RGB32:
+    case IMGFMT_BGR32: return yuv2rgb_c_32;
+    case IMGFMT_RGB24: return yuv2rgb_c_24_rgb;
+    case IMGFMT_BGR24: return yuv2rgb_c_24_bgr;
+    case IMGFMT_RGB16:
+    case IMGFMT_BGR16:
+    case IMGFMT_RGB15:
+    case IMGFMT_BGR15: return yuv2rgb_c_16;
+    case IMGFMT_RGB8:
+    case IMGFMT_BGR8:  return yuv2rgb_c_8_ordered_dither;
+    case IMGFMT_RGB4:
+    case IMGFMT_BGR4:  return yuv2rgb_c_4_ordered_dither;
+    case IMGFMT_RG4B:
+    case IMGFMT_BG4B:  return yuv2rgb_c_4b_ordered_dither;
+    case IMGFMT_RGB1:
+    case IMGFMT_BGR1:  return yuv2rgb_c_1_ordered_dither;
+    default:
+    	assert(0); /* unknown destination format */
+    }
+    return NULL; /* only reached for an unknown format when assert() is compiled out */
 }
 
-
 static int div_round (int dividend, int divisor)
 {
     if (dividend > 0)
@@ -862,8 +644,10 @@ static int div_round (int dividend, int divisor)
 	return -((-dividend + (divisor>>1)) / divisor);
 }
 
-void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_gU[256], int table_gV[256], void *table_bU[256])
+int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
 {  
+    const int bpp= c->dstFormat&0xFF;
+    const int isRgb= (c->dstFormat>>24) != 'R';
     int i;
     uint8_t table_Y[1024];
     uint32_t *table_32 = 0;
@@ -876,23 +660,37 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
     void *table_r = 0, *table_g = 0, *table_b = 0;
     void *table_start;
 
-    int crv = Inverse_Table_6_9[matrix_coefficients][0];
-    int cbu = Inverse_Table_6_9[matrix_coefficients][1];
-    int cgu = -Inverse_Table_6_9[matrix_coefficients][2];
-    int cgv = -Inverse_Table_6_9[matrix_coefficients][3];
+    int64_t crv =  inv_table[0];
+    int64_t cbu =  inv_table[1];
+    int64_t cgu = -inv_table[2];
+    int64_t cgv = -inv_table[3];
+    int64_t cy  = 1<<16;
+    int64_t oy  = 0;
+
+//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
+    if(!fullRange){
+	cy= (cy*255) / 219;
+	oy= 16<<16;
+    }
+	
+    cy = (cy *contrast             )>>16;
+    crv= (crv*contrast * saturation)>>32;
+    cbu= (cbu*contrast * saturation)>>32;
+    cgu= (cgu*contrast * saturation)>>32;
+    cgv= (cgv*contrast * saturation)>>32;
+//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
+    oy -= 256*brightness;
 
     for (i = 0; i < 1024; i++) {
 	int j;
 
-	j = (76309 * (i - 384 - 16) + 32768) >> 16;
+	j= (cy*((int64_t)(i - 384)*(1<<16) - oy) + ((int64_t)1<<31))>>32; /* round to nearest in 64 bit: a plain 1<<31 overflows int and turns the rounding term negative */
 	j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
 	table_Y[i] = j;
     }
 
     switch (bpp) {
     case 32:
-	yuv2rgb_c_internal = yuv2rgb_c_32;
-
 	table_start= table_32 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
 
 	entry_size = sizeof (uint32_t);
@@ -901,17 +699,14 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	table_g = table_32 + 197 + 2*682;
 
 	for (i = -197; i < 256+197; i++)
-	    ((uint32_t *)table_r)[i] = table_Y[i+384] << ((mode==MODE_RGB) ? 16 : 0);
+	    ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
 	for (i = -132; i < 256+132; i++)
 	    ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
 	for (i = -232; i < 256+232; i++)
-	    ((uint32_t *)table_b)[i] = table_Y[i+384] << ((mode==MODE_RGB) ? 0 : 16);
+	    ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
 	break;
 
     case 24:
-//	yuv2rgb_c_internal = (mode==MODE_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;
-	yuv2rgb_c_internal = (mode!=MODE_RGB) ? yuv2rgb_c_24_rgb : yuv2rgb_c_24_bgr;
-
 	table_start= table_8 = malloc ((256 + 2*232) * sizeof (uint8_t));
 
 	entry_size = sizeof (uint8_t);
@@ -923,8 +718,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 
     case 15:
     case 16:
-	yuv2rgb_c_internal = yuv2rgb_c_16;
-
 	table_start= table_16 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
 
 	entry_size = sizeof (uint16_t);
@@ -935,7 +728,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = table_Y[i+384] >> 3;
 
-	    if (mode == MODE_RGB)
+	    if (isRgb)
 		j <<= ((bpp==16) ? 11 : 10);
 
 	    ((uint16_t *)table_r)[i] = j;
@@ -948,7 +741,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j = table_Y[i+384] >> 3;
 
-	    if (mode == MODE_BGR)
+	    if (!isRgb)
 		j <<= ((bpp==16) ? 11 : 10);
 
 	    ((uint16_t *)table_b)[i] = j;
@@ -956,8 +749,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;
 
     case 8:
-	yuv2rgb_c_internal = yuv2rgb_c_8_ordered_dither; //yuv2rgb_c_8;
-
 	table_start= table_332 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
 
 	entry_size = sizeof (uint8_t);
@@ -968,7 +759,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = (table_Y[i+384 - 16] + 18)/36;
 
-	    if (mode == MODE_RGB)
+	    if (isRgb)
 		j <<= 5;
 
 	    ((uint8_t *)table_r)[i] = j;
@@ -976,7 +767,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -132; i < 256+132; i++) {
 	    int j = (table_Y[i+384 - 16] + 18)/36;
 
-	    if (mode == MODE_BGR)
+	    if (!isRgb)
 		j <<= 1;
 
 	    ((uint8_t *)table_g)[i] = j << 2;
@@ -984,7 +775,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j = (table_Y[i+384 - 37] + 43)/85;
 
-	    if (mode == MODE_BGR)
+	    if (!isRgb)
 		j <<= 6;
 
 	    ((uint8_t *)table_b)[i] = j;
@@ -992,11 +783,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;
     case 4:
     case 4|128:
-        if(bpp==4)
-	    yuv2rgb_c_internal = yuv2rgb_c_4_ordered_dither; //yuv2rgb_c_4;
-        else
-	    yuv2rgb_c_internal = yuv2rgb_c_4b_ordered_dither; //yuv2rgb_c_4;
-
 	table_start= table_121 = malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
 
 	entry_size = sizeof (uint8_t);
@@ -1007,7 +793,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -197; i < 256+197; i++) {
 	    int j = table_Y[i+384 - 110] >> 7;
 
-	    if (mode == MODE_RGB)
+	    if (isRgb)
 		j <<= 3;
 
 	    ((uint8_t *)table_r)[i] = j;
@@ -1020,7 +806,7 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	for (i = -232; i < 256+232; i++) {
 	    int j =table_Y[i+384 - 110] >> 7;
 
-	    if (mode == MODE_BGR)
+	    if (!isRgb)
 		j <<= 3;
 
 	    ((uint8_t *)table_b)[i] = j;
@@ -1028,8 +814,6 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
 	break;
 
     case 1:
-	yuv2rgb_c_internal = yuv2rgb_c_1_ordered_dither;
-
 	table_start= table_1 = malloc (256*2 * sizeof (uint8_t));
 
 	entry_size = sizeof (uint8_t);
@@ -1046,15 +830,18 @@ void *yuv2rgb_c_init (unsigned bpp, int mode, void *table_rV[256], void *table_g
     default:
 	table_start= NULL;
 	mp_msg(MSGT_SWS,MSGL_ERR,"%ibpp not supported by yuv2rgb\n", bpp);
-	//exit (1);
+	//free mem?
+	return -1;
     }
 
     for (i = 0; i < 256; i++) {
-	table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309);
-	table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309);
-	table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
-	table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309);
+	c->table_rV[i] = table_r + entry_size * div_round (crv * (i-128), 76309); /* NOTE(review): 76309 is what cy is for limited range at unity contrast; presumably this divisor should follow cy when fullRange/contrast change it - confirm */
+	c->table_gU[i] = table_g + entry_size * div_round (cgu * (i-128), 76309); /* pointer into the green table, displaced by the U contribution */
+	c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); /* a plain byte offset (no table base added), unlike the three pointer tables */
+	c->table_bU[i] = table_b + entry_size * div_round (cbu * (i-128), 76309); /* pointer into the blue table, displaced by the U contribution */
     }
 
-    return table_start; 
+    if(c->yuvTable) free(c->yuvTable);
+    c->yuvTable= table_start;
+    return 0;
 }
diff --git a/postproc/yuv2rgb_mlib.c b/postproc/yuv2rgb_mlib.c
index a75cf4de9b..142aa1deb0 100644
--- a/postproc/yuv2rgb_mlib.c
+++ b/postproc/yuv2rgb_mlib.c
@@ -26,52 +26,72 @@
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <assert.h>
 
-static void mlib_YUV2ARGB420_32(uint8_t* image, uint8_t* py, 
-			 uint8_t* pu, uint8_t* pv, 
-			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
-{
-  mlib_VideoColorYUV2ARGB420(image, py, pu, pv, h_size,
-			     v_size, rgb_stride, y_stride, uv_stride);
-}
+#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
+#include "swscale.h"
 
-static void mlib_YUV2ABGR420_32(uint8_t* image, uint8_t* py, 
-			 uint8_t* pu, uint8_t* pv, 
-			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
-{
-  mlib_VideoColorYUV2ABGR420(image, py, pu, pv, h_size,
-			     v_size, rgb_stride, y_stride, uv_stride);
+static void mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
+             int srcSliceH, uint8_t* dst[], int dstStride[]){ /* SwsFunc wrapper: converts one source slice with mlib_VideoColorYUV2ARGB420 */
+    uint8_t *src[3];
+    int srcStride[3];
+
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){ /* doubled chroma stride for 422P input */
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
+    
+    assert(srcStride[1] == srcStride[2]); /* mlib is given a single chroma stride */
+ 
+    mlib_VideoColorYUV2ARGB420(dst[0] + srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, /* start at this slice's first output row */
+			     srcSliceH, dstStride[0], srcStride[0], srcStride[1]); /* convert only the rows handed in, not the whole picture */
 }
 
-static void mlib_YUV2RGB420_24(uint8_t* image, uint8_t* py, 
-			 uint8_t* pu, uint8_t* pv, 
-			 unsigned h_size, unsigned v_size, 
-			 int rgb_stride, int y_stride, int uv_stride)
-{
-  mlib_VideoColorYUV2RGB420(image, py, pu, pv, h_size,
-			    v_size, rgb_stride, y_stride, uv_stride);
+static void mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
+             int srcSliceH, uint8_t* dst[], int dstStride[]){ /* SwsFunc wrapper: converts one source slice with mlib_VideoColorYUV2ABGR420 */
+    uint8_t *src[3];
+    int srcStride[3];
+
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){ /* doubled chroma stride for 422P input */
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
+    
+    assert(srcStride[1] == srcStride[2]); /* mlib is given a single chroma stride */
+ 
+    mlib_VideoColorYUV2ABGR420(dst[0] + srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, /* start at this slice's first output row */
+			     srcSliceH, dstStride[0], srcStride[0], srcStride[1]); /* convert only the rows handed in, not the whole picture */
 }
 
+static void mlib_YUV2RGB420_24(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY, 
+             int srcSliceH, uint8_t* dst[], int dstStride[]){ /* SwsFunc wrapper: converts one source slice with mlib_VideoColorYUV2RGB420 */
+    uint8_t *src[3];
+    int srcStride[3];
 
-yuv2rgb_fun yuv2rgb_init_mlib(unsigned bpp, int mode) 
-{  
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){ /* doubled chroma stride for 422P input */
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
+    
+    assert(srcStride[1] == srcStride[2]); /* mlib is given a single chroma stride */
+ 
+    mlib_VideoColorYUV2RGB420(dst[0] + srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, /* start at this slice's first output row */
+			     srcSliceH, dstStride[0], srcStride[0], srcStride[1]); /* convert only the rows handed in, not the whole picture */
+}
 
-	if( bpp == 24 ) 
-	{
-		if( mode == MODE_RGB )
-			return mlib_YUV2RGB420_24;
-  }
 
-	if( bpp == 32 ) 
-	{
-		if( mode == MODE_RGB )
-			return mlib_YUV2ARGB420_32;
-		else if( mode == MODE_BGR )
-			return mlib_YUV2ABGR420_32;
+SwsFunc yuv2rgb_init_mlib(SwsContext *c) /* returns the mlib converter for c->dstFormat, or NULL when mlib has none */
+{
+	switch(c->dstFormat){
+	case IMGFMT_RGB24: return mlib_YUV2RGB420_24;
+	case IMGFMT_RGB32: return mlib_YUV2ABGR420_32; /* was ARGB for both 32-bit formats, which left the ABGR converter unreachable */
+	case IMGFMT_BGR32: return mlib_YUV2ARGB420_32;
+	default: return NULL;
 	}
-  
-	return NULL;
 }
 
diff --git a/postproc/yuv2rgb_template.c b/postproc/yuv2rgb_template.c
index d6b2097808..7b4abf6a95 100644
--- a/postproc/yuv2rgb_template.c
+++ b/postproc/yuv2rgb_template.c
@@ -25,6 +25,7 @@
  *
  * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
  * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
+ * context / deglobalize stuff by Michael Niedermayer
  */
 
 #undef MOVNTQ
@@ -56,26 +57,24 @@
 		     "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
 		     "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
-		     "psubsw "MANGLE(mmx_80w)", %%mm0;" /* Cb -= 128 */ \
-		     "psubsw "MANGLE(mmx_80w)", %%mm1;" /* Cr -= 128 */ \
-\
 		     "psllw $3, %%mm0;" /* Promote precision */ \
 		     "psllw $3, %%mm1;" /* Promote precision */ \
 \
+		     "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
+		     "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
+\
 		     "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
 		     "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
 \
-		     "pmulhw "MANGLE(mmx_U_green)", %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
-		     "pmulhw "MANGLE(mmx_V_green)", %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
+		     "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
+		     "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
 \
-		     "pmulhw "MANGLE(mmx_U_blue)", %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
-		     "pmulhw "MANGLE(mmx_V_red)", %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
+		     "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
+		     "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
 \
 		     "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
 \
 		     /* convert the luma part */\
-		     "psubusb "MANGLE(mmx_10w)", %%mm6;" /* Y -= 16 */\
-\
 		     "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
 		     "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
 \
@@ -84,8 +83,11 @@
 		     "psllw $3, %%mm6;" /* Promote precision */\
 		     "psllw $3, %%mm7;" /* Promote precision */\
 \
-		     "pmulhw "MANGLE(mmx_Y_coeff)", %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
-		     "pmulhw "MANGLE(mmx_Y_coeff)", %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
+		     "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
+		     "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
+\
+		     "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
+		     "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
 \
 		     /* Do the addition part of the conversion for even and odd pixels,
 			register usage:
@@ -121,44 +123,44 @@
 		     "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
 
 
-static inline void RENAME(yuv420_rgb16) (uint8_t * image, uint8_t * py,
-			      uint8_t * pu, uint8_t * pv,
-			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
-{
-    int even = 1;
-    int x, y;
+static inline void RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int srcStride[3];
+    uint8_t *src[3];
+    int y, h_size;
 
-    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
-
-    for (y = v_size; --y >= 0; ) {
-	uint8_t *_image = image;
-	uint8_t *_py = py;
-	uint8_t *_pu = pu;
-	uint8_t *_pv = pv;
-	int internal_h_size= h_size;
-	int aligned_h_size= (h_size+7)&~7;
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
 
-	if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
+    h_size= (c->dstW+7)&~7;
+    if(h_size*2 > dstStride[0]) h_size-=8;
+    
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
+//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
+//srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
+    for (y= 0; y<srcSliceH; y++ ) {
+	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
+	uint8_t *_py = src[0] + y*srcStride[0];
+	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
+	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
+	int index= -h_size/2;
 
 	b5Dither= dither8[y&1];
 	g6Dither= dither4[y&1];
 	g5Dither= dither8[y&1];
 	r5Dither= dither8[(y+1)&1];
-
-	/* load data for start of next scan line */
-	__asm__ __volatile__ (
-		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-		 : : "r" (_py), "r" (_pu), "r" (_pv));
-
-	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */
-
 	    __asm__ __volatile__ (
+	/* load data for start of next scan line */
+		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+//		    ".balign 16			\n\t"
+		    "1:				\n\t"
 /* no speed diference on my p3@500 with prefetch,
  * if it is faster for anyone with -benchmark then tell me
 			PREFETCH" 64(%0) \n\t"
@@ -190,80 +192,71 @@ YUV2RGB
 		     "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
 		     "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
 
-		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-		     MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
+		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+		     MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
 
 		     /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
 		     "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
 		     "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
 
 		     "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
-		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 
 		     "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
-		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-		     MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
-		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
-
-	    _py += 8;
-	    _pu += 4;
-	    _pv += 4;
-	    _image += 16;
-	}
-
-	if (!even) {
-	    pu += uv_stride;
-	    pv += uv_stride;
-	}
-
-	py += y_stride;
-	image += rgb_stride;
-
-	even = (!even);
+		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+
+		     MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
+		     
+		     "addl $16, %1			\n\t"
+		     "addl $4, %0			\n\t"
+		     " js 1b				\n\t"
+		     
+		     : "+r" (index), "+r" (_image)
+		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
+		     );
     }
 
     __asm__ __volatile__ (EMMS);
 }
 
-static inline void RENAME(yuv420_rgb15) (uint8_t * image, uint8_t * py,
-			      uint8_t * pu, uint8_t * pv,
-			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
-{
-    int even = 1;
-    int x, y;
+static inline void RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int srcStride[3];
+    uint8_t *src[3];
+    int y, h_size;
 
-    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
-
-    for (y = v_size; --y >= 0; ) {
-	uint8_t *_image = image;
-	uint8_t *_py = py;
-	uint8_t *_pu = pu;
-	uint8_t *_pv = pv;
-	int internal_h_size= h_size;
-	int aligned_h_size= (h_size+7)&~7;
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
 
-	if(rgb_stride >= aligned_h_size*2) internal_h_size= aligned_h_size;
+    h_size= (c->dstW+7)&~7;
+    if(h_size*2 > dstStride[0]) h_size-=8;
+    
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
+//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
+//srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
+    for (y= 0; y<srcSliceH; y++ ) {
+	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
+	uint8_t *_py = src[0] + y*srcStride[0];
+	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
+	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
+	int index= -h_size/2;
 
 	b5Dither= dither8[y&1];
 	g6Dither= dither4[y&1];
 	g5Dither= dither8[y&1];
 	r5Dither= dither8[(y+1)&1];
-
-	/* load data for start of next scan line */
-	__asm__ __volatile__ (
-		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-		 : : "r" (_py), "r" (_pu), "r" (_pv));
-
-	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */
-
 	    __asm__ __volatile__ (
+	/* load data for start of next scan line */
+		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+//		    ".balign 16			\n\t"
+		    "1:				\n\t"
 YUV2RGB
 
 #ifdef DITHER1XBPP
@@ -291,75 +284,65 @@ YUV2RGB
 		     "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
 		     "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
 
-		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-		     MOVNTQ " %%mm0, (%3);" /* store pixel 0-3 */
+		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+		     MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
 
 		     /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
 		     "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
 		     "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
 
 		     "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
-		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 
 		     "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
-		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-		     MOVNTQ " %%mm5, 8 (%3);" /* store pixel 4-7 */
-		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
-
-	    _py += 8;
-	    _pu += 4;
-	    _pv += 4;
-	    _image += 16;
-	}
-
-	if (!even) {
-	    pu += uv_stride;
-	    pv += uv_stride;
-	}
-
-	py += y_stride;
-	image += rgb_stride;
-
-	even = (!even);
+		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+
+		     MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
+		     
+		     "addl $16, %1			\n\t"
+		     "addl $4, %0			\n\t"
+		     " js 1b				\n\t"
+		     : "+r" (index), "+r" (_image)
+		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
+		     );
     }
 
     __asm__ __volatile__ (EMMS);
 }
 
-static inline void RENAME(yuv420_rgb24) (uint8_t * image, uint8_t * py,
-			      uint8_t * pu, uint8_t * pv,
-			      unsigned h_size, unsigned v_size,
-			      int rgb_stride, int y_stride, int uv_stride)
-{
-    int even = 1;
-    int x, y;
+static inline void RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int srcStride[3];
+    uint8_t *src[3];
+    int y, h_size;
 
-    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
-
-    for (y = v_size; --y >= 0; ) {
-	uint8_t *_image = image;
-	uint8_t *_py = py;
-	uint8_t *_pu = pu;
-	uint8_t *_pv = pv;
-	int internal_h_size= h_size;
-	int aligned_h_size= (h_size+7)&~7;
-
-	if(rgb_stride >= aligned_h_size*3) internal_h_size= aligned_h_size;
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
 
-	/* load data for start of next scan line */
-	__asm__ __volatile__ (
-		 "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-		 "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-		 "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+    h_size= (c->dstW+7)&~7;
+    if(h_size*3 > dstStride[0]) h_size-=8;
+    
+    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 
-		 : : "r" (_py), "r" (_pu), "r" (_pv));
+    for (y= 0; y<srcSliceH; y++ ) {
+	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
+	uint8_t *_py = src[0] + y*srcStride[0];
+	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
+	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
+	int index= -h_size/2;
 
-	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */
-
 	    __asm__ __volatile__ (
+	/* load data for start of next scan line */
+		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+//		    ".balign 16			\n\t"
+		    "1:				\n\t"
 YUV2RGB
 	/* mm0=B, %%mm2=G, %%mm1=R */
 #ifdef HAVE_MMX2
@@ -376,7 +359,7 @@ YUV2RGB
 			"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */
 			"por %%mm5, %%mm6		\n\t"
 			"por %%mm3, %%mm6		\n\t"
-			MOVNTQ" %%mm6, (%3)		\n\t"
+			MOVNTQ" %%mm6, (%1)		\n\t"
 
 			"psrlq $8, %%mm2		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
 			"pshufw $0xA5, %%mm0, %%mm5	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
@@ -389,22 +372,22 @@ YUV2RGB
 
 			"por %%mm5, %%mm3		\n\t" /* B5    G4 B4     G3 B3    */
 			"por %%mm3, %%mm6		\n\t"
-			MOVNTQ" %%mm6, 8(%3)		\n\t"
+			MOVNTQ" %%mm6, 8(%1)		\n\t"
 
 			"pshufw $0xFF, %%mm0, %%mm5	\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */
 			"pshufw $0xFA, %%mm2, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
 			"pshufw $0xFA, %%mm1, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
-			"movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+			"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 
 			"pand %%mm7, %%mm5		\n\t" /*       B7        B6       */
 			"pand %%mm4, %%mm3		\n\t" /*    G7        G6       G5 */
 			"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7       R6        R5    */
-			"movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+			"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 \
 			"por %%mm5, %%mm3		\n\t"
 			"por %%mm3, %%mm6		\n\t"
-			MOVNTQ" %%mm6, 16(%3)		\n\t"
-			"movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+			MOVNTQ" %%mm6, 16(%1)		\n\t"
+			"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 			"pxor %%mm4, %%mm4		\n\t"
 
 #else
@@ -442,83 +425,72 @@ YUV2RGB
 			"movq %%mm0, %%mm6		\n\t" /* 0RGBRGB0 1 */
 			"psllq $40, %%mm0		\n\t" /* GB000000 1 */
 			"por %%mm0, %%mm7		\n\t" /* GBRGBRGB 0 */
-			MOVNTQ" %%mm7, (%3)		\n\t"
+			MOVNTQ" %%mm7, (%1)		\n\t"
 
-			"movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+			"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
 
 			"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */
 			"movq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */
 			"psllq $24, %%mm5		\n\t" /* BRGB0000 2 */
 			"por %%mm5, %%mm6		\n\t" /* BRGBRGBR 1 */
-			MOVNTQ" %%mm6, 8(%3)		\n\t"
+			MOVNTQ" %%mm6, 8(%1)		\n\t"
 
-			"movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+			"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
 
 			"psrlq $40, %%mm1		\n\t" /* 000000RG 2 */
 			"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */
 			"por %%mm3, %%mm1		\n\t" /* RGBRGBRG 2 */
-			MOVNTQ" %%mm1, 16(%3)		\n\t"
+			MOVNTQ" %%mm1, 16(%1)		\n\t"
 
-			"movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+			"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 			"pxor %%mm4, %%mm4		\n\t"
 #endif
-
-		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
-
-	    _py += 8;
-	    _pu += 4;
-	    _pv += 4;
-	    _image += 24;
-	}
-
-	if (!even) {
-	    pu += uv_stride;
-	    pv += uv_stride;
-	}
-
-	py += y_stride;
-	image += rgb_stride;
-
-	even = (!even);
+		     
+		     "addl $24, %1			\n\t"
+		     "addl $4, %0			\n\t"
+		     " js 1b				\n\t"
+		     
+		     : "+r" (index), "+r" (_image)
+		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
+		     );
     }
 
     __asm__ __volatile__ (EMMS);
 }
 
+static inline void RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
+             int srcSliceH, uint8_t* dst[], int dstStride[]){
+    int srcStride[3];
+    uint8_t *src[3];
+    int y, h_size;
 
-static inline void RENAME(yuv420_argb32) (uint8_t * image, uint8_t * py,
-			       uint8_t * pu, uint8_t * pv,
-			       unsigned h_size, unsigned v_size,
-			       int rgb_stride, int y_stride, int uv_stride)
-{
-    int even = 1;
-    int x, y;
+    sws_orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
+    if(c->srcFormat == IMGFMT_422P){
+	srcStride[1] *= 2;
+	srcStride[2] *= 2;
+    }
 
+    h_size= (c->dstW+7)&~7;
+    if(h_size*4 > dstStride[0]) h_size-=8;
+    
     __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
 
-    for (y = v_size; --y >= 0; ) {
-	uint8_t *_image = image;
-	uint8_t *_py = py;
-	uint8_t *_pu = pu;
-	uint8_t *_pv = pv;
-	int internal_h_size= h_size;
-	int aligned_h_size= (h_size+7)&~7;
-
-	if(rgb_stride >= aligned_h_size*4) internal_h_size= aligned_h_size;
+    for (y= 0; y<srcSliceH; y++ ) {
+	uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
+	uint8_t *_py = src[0] + y*srcStride[0];
+	uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
+	uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
+	int index= -h_size/2;
 
-	/* load data for start of next scan line */
-	__asm__ __volatile__ 
-	    (
-	     "movd (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ 
-	     "movd (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ 
-	     "movq (%0), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ 
-	     : : "r" (_py), "r" (_pu), "r" (_pv)
-	     );
-
-	for (x = internal_h_size >> 3; --x >= 0; ) {
 	    /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
 	       pixels in each iteration */
 	    __asm__ __volatile__ (
+	/* load data for start of next scan line */
+		     "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+		     "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+//		    ".balign 16			\n\t"
+		    "1:				\n\t"
 YUV2RGB
 		     /* convert RGB plane to RGB packed format,
 			mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
@@ -536,60 +508,40 @@ YUV2RGB
 		     "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
 
 		     "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
-		     MOVNTQ " %%mm6, (%3);" /* Store ARGB1 ARGB0 */
+		     MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */
 
 		     "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
 		     "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
 
 		     "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
-		     MOVNTQ " %%mm6, 8 (%3);" /* Store ARGB3 ARGB2 */
+		     MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */
 
 		     "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
 		     "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
 
 		     "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
-		     MOVNTQ " %%mm4, 16 (%3);" /* Store ARGB5 ARGB4 */
+		     MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */
 
 		     "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
 		     "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
 
 		     "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
-		     MOVNTQ " %%mm4, 24 (%3);" /* Store ARGB7 ARGB6 */
+		     MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */
 
-		     "movd 4 (%1), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-		     "movd 4 (%2), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
+		     "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
+		     "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
 
 		     "pxor %%mm4, %%mm4;" /* zero mm4 */
-		     "movq 8 (%0), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-		     : : "r" (_py), "r" (_pu), "r" (_pv), "r" (_image));
-
-	    _py += 8;
-	    _pu += 4;
-	    _pv += 4;
-	    _image += 32;
-	}
-
-	if (!even) {
-	    pu += uv_stride;
-	    pv += uv_stride;
-	}
-
-	py += y_stride;
-	image += rgb_stride;
-
-	even = (!even);
+		     "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+
+		     "addl $32, %1			\n\t"
+		     "addl $4, %0			\n\t"
+		     " js 1b				\n\t"
+		     
+		     : "+r" (index), "+r" (_image)
+		     : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
+		     );
     }
 
     __asm__ __volatile__ (EMMS);
 }
-
-yuv2rgb_fun RENAME(yuv2rgb_init) (unsigned bpp, int mode)
-{
-    if (bpp == 15 && mode == MODE_RGB) return RENAME(yuv420_rgb15);
-    if (bpp == 16 && mode == MODE_RGB) return RENAME(yuv420_rgb16);
-    if (bpp == 24 && mode == MODE_RGB) return RENAME(yuv420_rgb24);
-    if (bpp == 32 && mode == MODE_RGB) return RENAME(yuv420_argb32);
-    return NULL; // Fallback to C.
-}
-