From dcb7c868ec7af7d3a138b3254ef2e08f074d8ec5 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 27 Aug 2014 02:58:07 +0200 Subject: cosmetics: Make naming scheme of Xvid IDCT consistent with other IDCTs --- libavcodec/x86/Makefile | 4 +- libavcodec/x86/dct-test.c | 8 +- libavcodec/x86/idct_mmx_xvid.c | 545 --------------------------------------- libavcodec/x86/idct_sse2_xvid.c | 405 ----------------------------- libavcodec/x86/idct_xvid.h | 43 ---- libavcodec/x86/xvididct.h | 43 ++++ libavcodec/x86/xvididct_init.c | 23 +- libavcodec/x86/xvididct_mmx.c | 547 ++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/xvididct_sse2.c | 406 +++++++++++++++++++++++++++++ 9 files changed, 1014 insertions(+), 1010 deletions(-) delete mode 100644 libavcodec/x86/idct_mmx_xvid.c delete mode 100644 libavcodec/x86/idct_sse2_xvid.c delete mode 100644 libavcodec/x86/idct_xvid.h create mode 100644 libavcodec/x86/xvididct.h create mode 100644 libavcodec/x86/xvididct_mmx.c create mode 100644 libavcodec/x86/xvididct_sse2.c (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index d06c5a12b2..3cec000e1d 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -66,8 +66,8 @@ MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \ MMX-OBJS-$(CONFIG_QPELDSP) += x86/fpel_mmx.o # decoders/encoders -MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/idct_mmx_xvid.o \ - x86/idct_sse2_xvid.o +MMX-OBJS-$(CONFIG_MPEG4_DECODER) += x86/xvididct_mmx.o \ + x86/xvididct_sse2.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o diff --git a/libavcodec/x86/dct-test.c b/libavcodec/x86/dct-test.c index d97c53c7d1..9d4aaf5415 100644 --- a/libavcodec/x86/dct-test.c +++ b/libavcodec/x86/dct-test.c @@ -19,7 +19,7 @@ #include "config.h" #include "fdct.h" -#include "idct_xvid.h" +#include "xvididct.h" #include "simple_idct.h" static const struct algo fdct_tab_arch[] = { @@ -41,13 +41,13 @@ static const struct algo idct_tab_arch[] = { #endif #if CONFIG_MPEG4_DECODER #if HAVE_MMX_INLINE - { "XVID-MMX", ff_idct_xvid_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 }, + { "XVID-MMX", ff_xvid_idct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 }, #endif #if HAVE_MMXEXT_INLINE - { "XVID-MMXEXT", ff_idct_xvid_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 }, + { "XVID-MMXEXT", ff_xvid_idct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 }, #endif #if HAVE_SSE2_INLINE - { "XVID-SSE2", ff_idct_xvid_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 }, + { "XVID-SSE2", ff_xvid_idct_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 }, #endif #endif /* CONFIG_MPEG4_DECODER */ { 0 } diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c deleted file mode 100644 index e5cafd5536..0000000000 --- a/libavcodec/x86/idct_mmx_xvid.c +++ /dev/null @@ -1,545 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - MMX and XMM forward discrete cosine transform - - * - * Copyright(C) 2001 Peter Ross - * - * Originally provided by Intel at AP-922 - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) - * but in a limited edition. - * New macro implements a column part for precise iDCT - * The routine precision now satisfies IEEE standard 1180-1990. 
- * - * Copyright(C) 2000-2001 Peter Gubanov - * Rounding trick Copyright(C) 2000 Michel Lespinasse - * - * http://www.elecard.com/peter/idct.html - * http://www.linuxvideo.org/mpeg2dec/ - * - * These examples contain code fragments for first stage iDCT 8x8 - * (for rows) and first stage DCT 8x8 (for columns) - * - * conversion to gcc syntax by Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "config.h" -#include "libavcodec/avcodec.h" -#include "libavutil/mem.h" -#include "idct_xvid.h" -#include "idctdsp.h" - -#if HAVE_MMX_INLINE - -//----------------------------------------------------------------------------- -// Various memory constants (trigonometric values or rounding values) -//----------------------------------------------------------------------------- - - -DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { - 13036,13036,13036,13036, // tg * (2<<16) + 0.5 - 27146,27146,27146,27146, // tg * (2<<16) + 0.5 - -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 - 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 - -DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { - 65536,65536, - 3597,3597, - 2260,2260, - 1203,1203, - 0,0, - 120,120, - 512,512, - 512,512}; - -//----------------------------------------------------------------------------- -// -// The first stage iDCT 8x8 - inverse DCTs of rows -// -//----------------------------------------------------------------------------- -// The 8-point inverse DCT direct algorithm -//----------------------------------------------------------------------------- -// -// static const short w[32] = { -// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), -// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), -// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), -// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), -// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), -// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; -// -// #define DCT_8_INV_ROW(x, y) -// { -// int a0, a1, a2, a3, b0, b1, b2, b3; -// -// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3]; -// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7]; -// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; -// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; -// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; -// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; -// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; -// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; -// -// y[0] = SHIFT_ROUND ( a0 + b0 ); -// y[1] = SHIFT_ROUND ( a1 + b1 ); -// 
y[2] = SHIFT_ROUND ( a2 + b2 ); -// y[3] = SHIFT_ROUND ( a3 + b3 ); -// y[4] = SHIFT_ROUND ( a3 - b3 ); -// y[5] = SHIFT_ROUND ( a2 - b2 ); -// y[6] = SHIFT_ROUND ( a1 - b1 ); -// y[7] = SHIFT_ROUND ( a0 - b0 ); -// } -// -//----------------------------------------------------------------------------- -// -// In this implementation the outputs of the iDCT-1D are multiplied -// for rows 0,4 - by cos_4_16, -// for rows 1,7 - by cos_1_16, -// for rows 2,6 - by cos_2_16, -// for rows 3,5 - by cos_3_16 -// and are shifted to the left for better accuracy -// -// For the constants used, -// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// Tables for mmx processors -//----------------------------------------------------------------------------- - -// Table for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = { - 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 - 21407,8867,8867,-21407, // w07 w05 w03 w01 - 16384,-16384,16384,16384, // w14 w12 w10 w08 - -8867,21407,-21407,-8867, // w15 w13 w11 w09 - 22725,12873,19266,-22725, // w22 w20 w18 w16 - 19266,4520,-4520,-12873, // w23 w21 w19 w17 - 12873,4520,4520,19266, // w30 w28 w26 w24 - -22725,19266,-12873,-22725, // w31 w29 w27 w25 -// Table for rows 1,7 - constants are multiplied by cos_1_16 - 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 - 29692,12299,12299,-29692, // w07 w05 w03 w01 - 22725,-22725,22725,22725, // w14 w12 w10 w08 - -12299,29692,-29692,-12299, // w15 w13 w11 w09 - 31521,17855,26722,-31521, // w22 w20 w18 w16 - 26722,6270,-6270,-17855, // w23 w21 w19 w17 - 17855,6270,6270,26722, // w30 w28 w26 w24 - -31521,26722,-17855,-31521, // w31 w29 w27 w25 -// Table for rows 2,6 - constants are multiplied by cos_2_16 - 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 - 27969,11585,11585,-27969, // w07 w05 w03 w01 - 21407,-21407,21407,21407, // w14 w12 w10 w08 - -11585,27969,-27969,-11585, // w15 w13 w11 w09 - 29692,16819,25172,-29692, // w22 w20 w18 w16 - 25172,5906,-5906,-16819, // w23 w21 w19 w17 - 16819,5906,5906,25172, // w30 w28 w26 w24 - -29692,25172,-16819,-29692, // w31 w29 w27 w25 -// Table for rows 3,5 - constants are multiplied by cos_3_16 - 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 - 25172,10426,10426,-25172, // w07 w05 w03 w01 - 19266,-19266,19266,19266, // w14 w12 w10 w08 - -10426,25172,-25172,-10426, // w15 w13 w11 w09 - 26722,15137,22654,-26722, // w22 w20 w18 w16 - 22654,5315,-5315,-15137, // w23 w21 w19 w17 - 15137,5315,5315,22654, // w30 w28 w26 w24 - -26722,22654,-15137,-26722, // w31 w29 w27 w25 -}; -//----------------------------------------------------------------------------- -// Tables for xmm processors -//----------------------------------------------------------------------------- - -// %3 for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = { - 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 - 16384,8867,-16384,-21407, // w07 w06 w03 w02 - 16384,-8867,16384,-21407, // w13 w12 w09 w08 - -16384,21407,16384,-8867, // w15 w14 w11 w10 - 22725,19266,19266,-4520, // w21 w20 w17 w16 - 12873,4520,-22725,-12873, // w23 w22 w19 w18 - 12873,-22725,4520,-12873, // w29 w28 w25 w24 - 4520,19266,19266,-22725, // w31 w30 w27 w26 -// %3 for rows 1,7 - constants are multiplied by cos_1_16 - 
22725,29692,22725,12299, // movq-> w05 w04 w01 w00 - 22725,12299,-22725,-29692, // w07 w06 w03 w02 - 22725,-12299,22725,-29692, // w13 w12 w09 w08 - -22725,29692,22725,-12299, // w15 w14 w11 w10 - 31521,26722,26722,-6270, // w21 w20 w17 w16 - 17855,6270,-31521,-17855, // w23 w22 w19 w18 - 17855,-31521,6270,-17855, // w29 w28 w25 w24 - 6270,26722,26722,-31521, // w31 w30 w27 w26 -// %3 for rows 2,6 - constants are multiplied by cos_2_16 - 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 - 21407,11585,-21407,-27969, // w07 w06 w03 w02 - 21407,-11585,21407,-27969, // w13 w12 w09 w08 - -21407,27969,21407,-11585, // w15 w14 w11 w10 - 29692,25172,25172,-5906, // w21 w20 w17 w16 - 16819,5906,-29692,-16819, // w23 w22 w19 w18 - 16819,-29692,5906,-16819, // w29 w28 w25 w24 - 5906,25172,25172,-29692, // w31 w30 w27 w26 -// %3 for rows 3,5 - constants are multiplied by cos_3_16 - 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 - 19266,10426,-19266,-25172, // w07 w06 w03 w02 - 19266,-10426,19266,-25172, // w13 w12 w09 w08 - -19266,25172,19266,-10426, // w15 w14 w11 w10 - 26722,22654,22654,-5315, // w21 w20 w17 w16 - 15137,5315,-26722,-15137, // w23 w22 w19 w18 - 15137,-26722,5315,-15137, // w29 w28 w25 w24 - 5315,22654,22654,-26722, // w31 w30 w27 w26 -}; -//============================================================================= -// Helper macros for the code -//============================================================================= - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ - "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ - "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ - "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ - "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ - "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ - "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ - "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\ - "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ - "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ - "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ - "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ - "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ - "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ - "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ - "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ - "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ - "movq 
%%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ - "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ - "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ - "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ - "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\ - "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ - "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ - "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ - "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ - "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ - "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ - "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ - "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ - "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ - "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ - "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ - "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ - "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ - "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ - "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ - "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ - "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ - "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ - "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// -// The first stage DCT 8x8 - forward DCTs of columns -// -// The %2puts are multiplied -// for rows 0,4 - on cos_4_16, -// for rows 1,7 - on cos_1_16, -// for rows 2,6 - on cos_2_16, -// for rows 3,5 - on cos_3_16 -// and are shifted to the left for rise of accuracy -// -//----------------------------------------------------------------------------- -// -// The 8-point scaled forward DCT algorithm (26a8m) -// 
-//----------------------------------------------------------------------------- -// -// #define DCT_8_FRW_COL(x, y) -//{ -// short t0, t1, t2, t3, t4, t5, t6, t7; -// short tp03, tm03, tp12, tm12, tp65, tm65; -// short tp465, tm465, tp765, tm765; -// -// t0 = LEFT_SHIFT ( x[0] + x[7] ); -// t1 = LEFT_SHIFT ( x[1] + x[6] ); -// t2 = LEFT_SHIFT ( x[2] + x[5] ); -// t3 = LEFT_SHIFT ( x[3] + x[4] ); -// t4 = LEFT_SHIFT ( x[3] - x[4] ); -// t5 = LEFT_SHIFT ( x[2] - x[5] ); -// t6 = LEFT_SHIFT ( x[1] - x[6] ); -// t7 = LEFT_SHIFT ( x[0] - x[7] ); -// -// tp03 = t0 + t3; -// tm03 = t0 - t3; -// tp12 = t1 + t2; -// tm12 = t1 - t2; -// -// y[0] = tp03 + tp12; -// y[4] = tp03 - tp12; -// -// y[2] = tm03 + tm12 * tg_2_16; -// y[6] = tm03 * tg_2_16 - tm12; -// -// tp65 =(t6 +t5 )*cos_4_16; -// tm65 =(t6 -t5 )*cos_4_16; -// -// tp765 = t7 + tp65; -// tm765 = t7 - tp65; -// tp465 = t4 + tm65; -// tm465 = t4 - tm65; -// -// y[1] = tp765 + tp465 * tg_1_16; -// y[7] = tp765 * tg_1_16 - tp465; -// y[5] = tm765 * tg_3_16 + tm465; -// y[3] = tm765 - tm465 * tg_3_16; -//} -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// DCT_8_INV_COL_4 INP,OUT -//----------------------------------------------------------------------------- - -#define DCT_8_INV_COL(A1,A2)\ - "movq 2*8(%3),%%mm0\n\t"\ - "movq 16*3+" #A1 ",%%mm3\n\t"\ - "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\ - "movq 16*5+" #A1 ",%%mm5\n\t"\ - "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\ - "movq (%3),%%mm4\n\t"\ - "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\ - "movq 16*7+" #A1 ",%%mm7\n\t"\ - "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\ - "movq 16*1+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\ - "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\ - "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\ - "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\ - "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\ - "movq 3*8(%3),%%mm3\n\t"\ - "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\ - "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\ - "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\ - "movq %%mm4,%%mm5 \n\t"/* tp17*/\ - "movq %%mm2,%%mm6 \n\t"/* tm17*/\ - "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\ - "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\ - "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\ - "movq 1*8(%3),%%mm7\n\t"\ - "movq %%mm4,%%mm1 \n\t"/* t1*/\ - "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\ - "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\ - "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\ - "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\ - "movq 2*16+" #A1 ",%%mm5\n\t"\ - "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\ - "movq 6*16+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\ - "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\ - "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ - "movq 0*16+" #A1 ",%%mm2\n\t"\ - "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ - "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\ - "movq %%mm2,%%mm3 \n\t"/* x0*/\ - "movq 4*16+" #A1 ",%%mm6\n\t"\ - "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\ - "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\ - "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\ - "movq %%mm2,%%mm5 \n\t"/* tp04*/\ - "movq %%mm3,%%mm6 \n\t"/* tm04*/\ - "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\ - "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\ - "paddsw %%mm1,%%mm1 \n\t"/* b1*/\ - "paddsw %%mm4,%%mm4 \n\t"/* b2*/\ - "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = 
a0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\ - "movq %%mm3,%%mm7 \n\t"/* a1*/\ - "movq %%mm6,%%mm0 \n\t"/* a2*/\ - "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\ - "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\ - "psraw $6,%%mm3 \n\t"/* dst1*/\ - "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\ - "psraw $6,%%mm6 \n\t"/* dst2*/\ - "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\ - "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\ - "psraw $6,%%mm7 \n\t"/* dst6*/\ - "movq %%mm5,%%mm4 \n\t"/* a0*/\ - "psraw $6,%%mm0 \n\t"/* dst5*/\ - "movq %%mm3,1*16+" #A2 "\n\t"\ - "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\ - "movq %%mm6,2*16+" #A2 "\n\t"\ - "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\ - "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\ - "psraw $6,%%mm5 \n\t"/* dst0*/\ - "movq %%mm2,%%mm6 \n\t"/* a3*/\ - "psraw $6,%%mm4 \n\t"/* dst7*/\ - "movq %%mm0,5*16+" #A2 "\n\t"\ - "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\ - "movq %%mm7,6*16+" #A2 "\n\t"\ - "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\ - "movq %%mm5,0*16+" #A2 "\n\t"\ - "psraw $6,%%mm2 \n\t"/* dst3*/\ - "movq %%mm4,7*16+" #A2 "\n\t"\ - "psraw $6,%%mm6 \n\t"/* dst4*/\ - "movq %%mm2,3*16+" #A2 "\n\t"\ - "movq %%mm6,4*16+" #A2 "\n\t" - -//============================================================================= -// Code -//============================================================================= - -//----------------------------------------------------------------------------- -// void idct_mmx(uint16_t block[64]); -//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmx(short *block){ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); -} - -void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmx(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_MMX_INLINE */ - -#if HAVE_MMXEXT_INLINE - -//----------------------------------------------------------------------------- -// void idct_xmm(uint16_t block[64]); -//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmxext(short *block) -{ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); -} - 
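/*
 * [Editorial aside, not part of the patch] A plain-C model of the row pass
 * that the DCT_8_INV_ROW_MMX/XMM macros above vectorize.  It is written
 * against the logical w[32] ordering from the pseudocode comment near the
 * top of this file; the tab_i_04_* tables hold the same constants
 * pre-shuffled into movq/pmaddwd word order, as their w.. annotations show.
 * Each row is processed with the table matching its scale factor (rows
 * 0,4 / 1,7 / 2,6 / 3,5, the 64*{0..3}(%2) operands) and its own rounder
 * from rounder_0 (the 8*row(%1) operands).  The constants are fixed-point
 * trig values, e.g. 13036 ~= tan(pi/16) * 2^16 and 23170 ~= cos(pi/4) * 2^15.
 * The helper name is hypothetical and <stdint.h> is assumed; unlike the
 * asm, which saturates via packssdw, this sketch truncates on store.
 */
static void xvid_idct_row_ref(const int16_t x[8], int16_t y[8],
                              const int16_t w[32], int32_t rounder)
{
    int32_t a[4], b[4];

    for (int i = 0; i < 4; i++) {
        /* even half: x[0,2,4,6] against w[0..15], rounder folded in */
        a[i] = x[0] * w[4 * i + 0] + x[2] * w[4 * i + 1] +
               x[4] * w[4 * i + 2] + x[6] * w[4 * i + 3] + rounder;
        /* odd half: x[1,3,5,7] against w[16..31] */
        b[i] = x[1] * w[16 + 4 * i + 0] + x[3] * w[16 + 4 * i + 1] +
               x[5] * w[16 + 4 * i + 2] + x[7] * w[16 + 4 * i + 3];
    }
    for (int i = 0; i < 4; i++) {
        y[i]     = (int16_t) ((a[i] + b[i]) >> 11); /* SHIFT_ROUND(a + b) */
        y[7 - i] = (int16_t) ((a[i] - b[i]) >> 11); /* SHIFT_ROUND(a - b) */
    }
}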
-void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmxext(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_idct_xvid_mmxext(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_MMXEXT_INLINE */ diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c deleted file mode 100644 index 3bbc6220f6..0000000000 --- a/libavcodec/x86/idct_sse2_xvid.c +++ /dev/null @@ -1,405 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - SSE2 inverse discrete cosine transform - - * - * Copyright(C) 2003 Pascal Massimino - * - * Conversion to gcc syntax with modifications - * by Alexander Strange - * - * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. - * - * This file is part of Libav. - * - * Vertical pass is an implementation of the scheme: - * Loeffler C., Ligtenberg A., and Moschytz C.S.: - * Practical Fast 1D DCT Algorithm with Eleven Multiplications, - * Proc. ICASSP 1989, 988-991. - * - * Horizontal pass is a double 4x4 vector/matrix multiplication, - * (see also Intel's Application Note 922: - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * Copyright (C) 1999 Intel Corporation) - * - * More details at http://skal.planet-d.net/coding/dct.html - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with Libav; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/internal.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "idct_xvid.h" -#include "idctdsp.h" - -#if HAVE_SSE2_INLINE - -/** - * @file - * @brief SSE2 IDCT compatible with the Xvid IDCT - */ - -#define X8(x) x,x,x,x,x,x,x,x - -DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) -DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 -DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 -DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) -DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; - -DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { - 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, - 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, - 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, - 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b -}; - -DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { - 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, - 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, - 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, - 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df -}; - -DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { - 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, - 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, - 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, - 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 -}; - -DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { - 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, - 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, - 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, - 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e -}; - -DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { - 65536, 65536, 65536, 65536, - 3597, 3597, 3597, 3597, - 2260, 2260, 2260, 2260, - 1203, 1203, 1203, 1203, - 120, 120, 120, 120, - 512, 512, 512, 512 -}; - -// Temporary storage before the column pass -#define ROW1 "%%xmm6" -#define ROW3 "%%xmm4" -#define ROW5 "%%xmm5" -#define ROW7 "%%xmm7" - -#define CLEAR_ODD(r) "pxor "r","r" \n\t" -#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" - -#if ARCH_X86_64 - -# define ROW0 "%%xmm8" -# define REG0 ROW0 -# define ROW2 "%%xmm9" -# define REG2 ROW2 -# define ROW4 "%%xmm10" -# define REG4 ROW4 -# define ROW6 "%%xmm11" -# define REG6 ROW6 -# define CLEAR_EVEN(r) CLEAR_ODD(r) -# define PUT_EVEN(dst) PUT_ODD(dst) -# define XMMS "%%xmm12" -# define MOV_32_ONLY "#" -# define SREG2 REG2 -# define TAN3 "%%xmm13" -# define TAN1 "%%xmm14" - -#else - -# define ROW0 "(%0)" -# define REG0 "%%xmm4" -# define ROW2 "2*16(%0)" -# define REG2 "%%xmm4" -# define ROW4 "4*16(%0)" -# define REG4 "%%xmm6" -# define ROW6 "6*16(%0)" -# define REG6 "%%xmm6" -# define CLEAR_EVEN(r) -# define PUT_EVEN(dst) \ - "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ - "movdqa %%xmm2, "dst" \n\t" -# define XMMS "%%xmm2" -# define MOV_32_ONLY "movdqa " -# define SREG2 "%%xmm7" -# define TAN3 "%%xmm0" -# define TAN1 "%%xmm2" - -#endif - -#define ROUND(x) "paddd "MANGLE(x) - -#define JZ(reg, to) \ - "testl "reg","reg" \n\t" \ - "jz "to" \n\t" - -#define JNZ(reg, to) \ - 
"testl "reg","reg" \n\t" \ - "jnz "to" \n\t" - -#define TEST_ONE_ROW(src, reg, clear) \ - clear \ - "movq "src", %%mm1 \n\t" \ - "por 8+"src", %%mm1 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "pmovmskb %%mm1, "reg" \n\t" - -#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ - clear1 \ - clear2 \ - "movq "row1", %%mm1 \n\t" \ - "por 8+"row1", %%mm1 \n\t" \ - "movq "row2", %%mm2 \n\t" \ - "por 8+"row2", %%mm2 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "paddusb %%mm0, %%mm2 \n\t" \ - "pmovmskb %%mm1, "reg1" \n\t" \ - "pmovmskb %%mm2, "reg2" \n\t" - -///IDCT pass on rows. -#define iMTX_MULT(src, table, rounder, put) \ - "movdqa "src", %%xmm3 \n\t" \ - "movdqa %%xmm3, %%xmm0 \n\t" \ - "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ - "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ - "pmaddwd "table", %%xmm0 \n\t" \ - "pmaddwd 16+"table", %%xmm1 \n\t" \ - "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ - "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ - "pmaddwd 32+"table", %%xmm2 \n\t" \ - "pmaddwd 48+"table", %%xmm3 \n\t" \ - "paddd %%xmm1, %%xmm0 \n\t" \ - "paddd %%xmm3, %%xmm2 \n\t" \ - rounder", %%xmm0 \n\t" \ - "movdqa %%xmm2, %%xmm3 \n\t" \ - "paddd %%xmm0, %%xmm2 \n\t" \ - "psubd %%xmm3, %%xmm0 \n\t" \ - "psrad $11, %%xmm2 \n\t" \ - "psrad $11, %%xmm0 \n\t" \ - "packssdw %%xmm0, %%xmm2 \n\t" \ - put \ - "1: \n\t" - -#define iLLM_HEAD \ - "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ - "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ - -///IDCT pass on columns. -#define iLLM_PASS(dct) \ - "movdqa "TAN3", %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "pmulhw %%xmm5, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm5, %%xmm1 \n\t" \ - "psubsw %%xmm5, "TAN3" \n\t" \ - "paddsw %%xmm4, %%xmm1 \n\t" \ - "pmulhw %%xmm7, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "paddsw %%xmm6, %%xmm3 \n\t" \ - "psubsw %%xmm7, "TAN1" \n\t" \ - "movdqa %%xmm3, %%xmm7 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm7, %%xmm1 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ - MOV_32_ONLY ROW2", "REG2" \n\t" \ - MOV_32_ONLY ROW6", "REG6" \n\t" \ - "movdqa %%xmm7, %%xmm5 \n\t" \ - "pmulhw "REG6", %%xmm7 \n\t" \ - "pmulhw "REG2", %%xmm5 \n\t" \ - "paddsw "REG2", %%xmm7 \n\t" \ - "psubsw "REG6", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - MOV_32_ONLY ROW4", "REG4" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw "REG4", "REG0" \n\t" \ - "paddsw "XMMS", "REG4" \n\t" \ - "movdqa "REG4", "XMMS" \n\t" \ - "psubsw %%xmm7, "REG4" \n\t" \ - "paddsw "XMMS", %%xmm7 \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - 
"movdqa %%xmm7, %%xmm0 \n\t" \ - "movdqa "REG4", %%xmm4 \n\t" \ - "psubsw %%xmm1, %%xmm7 \n\t" \ - "psubsw "TAN1", "REG4" \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, %%xmm7 \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, "REG4" \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa "REG4", 4*16("dct") \n\t" \ - "movdqa %%xmm7, 7*16("dct") \n\t" - -///IDCT pass on columns, assuming rows 4-7 are zero. -#define iLLM_PASS_SPARSE(dct) \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "movdqa %%xmm6, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "movdqa %%xmm4, %%xmm1 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "paddsw %%xmm6, %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ - MOV_32_ONLY ROW2", "SREG2" \n\t" \ - "pmulhw "SREG2", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - "movdqa "REG0", %%xmm6 \n\t" \ - "psubsw "SREG2", %%xmm6 \n\t" \ - "paddsw "REG0", "SREG2" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - "movdqa "SREG2", %%xmm0 \n\t" \ - "movdqa %%xmm6, %%xmm4 \n\t" \ - "psubsw %%xmm1, "SREG2" \n\t" \ - "psubsw "TAN1", %%xmm6 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, "SREG2" \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, %%xmm6 \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa %%xmm6, 4*16("dct") \n\t" \ - "movdqa "SREG2", 7*16("dct") \n\t" - -inline void ff_idct_xvid_sse2(short *block) -{ - __asm__ volatile( - "movq "MANGLE(m127)", %%mm0 \n\t" - iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) - iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) - iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) - - TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) - JZ("%%eax", "1f") - iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) - - TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) - TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) - iLLM_HEAD - ".p2align 4 \n\t" - JNZ("%%ecx", "2f") - JNZ("%%eax", "3f") - JNZ("%%edx", "4f") - JNZ("%%esi", "5f") - iLLM_PASS_SPARSE("%0") - "jmp 6f \n\t" - "2: \n\t" - iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) - "3: \n\t" - iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) - JZ("%%edx", "1f") - "4: \n\t" - 
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) - JZ("%%esi", "1f") - "5: \n\t" - iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) -#if ARCH_X86_32 - iLLM_HEAD -#endif - iLLM_PASS("%0") - "6: \n\t" - : "+r"(block) - : - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,) -#if ARCH_X86_64 - XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14",) -#endif - "%eax", "%ecx", "%edx", "%esi", "memory" - ); -} - -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - ff_put_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -#endif /* HAVE_SSE2_INLINE */ diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h deleted file mode 100644 index aea28bab96..0000000000 --- a/libavcodec/x86/idct_xvid.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * header for Xvid IDCT functions - */ - -#ifndef AVCODEC_X86_IDCT_XVID_H -#define AVCODEC_X86_IDCT_XVID_H - -#include - -void ff_idct_xvid_mmx(short *block); -void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block); - -void ff_idct_xvid_mmxext(short *block); -void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block); - -void ff_idct_xvid_sse2(short *block); -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block); -void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block); - -#endif /* AVCODEC_X86_IDCT_XVID_H */ diff --git a/libavcodec/x86/xvididct.h b/libavcodec/x86/xvididct.h new file mode 100644 index 0000000000..13a4e85890 --- /dev/null +++ b/libavcodec/x86/xvididct.h @@ -0,0 +1,43 @@ +/* + * XVID MPEG-4 VIDEO CODEC + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * header for Xvid IDCT functions + */ + +#ifndef AVCODEC_X86_XVIDIDCT_H +#define AVCODEC_X86_XVIDIDCT_H + +#include + +void ff_xvid_idct_mmx(short *block); +void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block); +void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block); + +void ff_xvid_idct_mmxext(short *block); +void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block); +void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block); + +void ff_xvid_idct_sse2(short *block); +void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block); +void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block); + +#endif /* AVCODEC_X86_XVIDIDCT_H */ diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c index d5d8ac9630..ce5f240aee 100644 --- a/libavcodec/x86/xvididct_init.c +++ b/libavcodec/x86/xvididct_init.c @@ -22,31 +22,32 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/idctdsp.h" #include "libavcodec/xvididct.h" -#include "idct_xvid.h" + #include "idctdsp.h" +#include "xvididct.h" -av_cold void ff_xvididct_init_x86(IDCTDSPContext *c) +av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c) { int cpu_flags = av_get_cpu_flags(); if (INLINE_MMX(cpu_flags)) { - c->idct_put = ff_idct_xvid_mmx_put; - c->idct_add = ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; + c->idct_put = ff_xvid_idct_mmx_put; + c->idct_add = ff_xvid_idct_mmx_add; + c->idct = ff_xvid_idct_mmx; c->perm_type = FF_IDCT_PERM_NONE; } if (INLINE_MMXEXT(cpu_flags)) { - c->idct_put = ff_idct_xvid_mmxext_put; - c->idct_add = ff_idct_xvid_mmxext_add; - c->idct = ff_idct_xvid_mmxext; + c->idct_put = ff_xvid_idct_mmxext_put; + c->idct_add = ff_xvid_idct_mmxext_add; + c->idct = ff_xvid_idct_mmxext; c->perm_type = FF_IDCT_PERM_NONE; } if (INLINE_SSE2(cpu_flags)) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; + c->idct_put = ff_xvid_idct_sse2_put; + c->idct_add = ff_xvid_idct_sse2_add; + c->idct = ff_xvid_idct_sse2; c->perm_type = FF_IDCT_PERM_SSE2; } } diff --git a/libavcodec/x86/xvididct_mmx.c b/libavcodec/x86/xvididct_mmx.c new file mode 100644 index 0000000000..7fa7f71b25 --- /dev/null +++ b/libavcodec/x86/xvididct_mmx.c @@ -0,0 +1,547 @@ +/* + * XVID MPEG-4 VIDEO CODEC + * - MMX and XMM forward discrete cosine transform - + * + * Copyright(C) 2001 Peter Ross + * + * Originally provided by Intel at AP-922 + * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm + * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) + * but in a limited edition. + * New macro implements a column part for precise iDCT + * The routine precision now satisfies IEEE standard 1180-1990. + * + * Copyright(C) 2000-2001 Peter Gubanov + * Rounding trick Copyright(C) 2000 Michel Lespinasse + * + * http://www.elecard.com/peter/idct.html + * http://www.linuxvideo.org/mpeg2dec/ + * + * These examples contain code fragments for first stage iDCT 8x8 + * (for rows) and first stage DCT 8x8 (for columns) + * + * conversion to gcc syntax by Michael Niedermayer + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Libav; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "config.h" +#include "libavcodec/avcodec.h" +#include "libavutil/mem.h" + +#include "idctdsp.h" +#include "xvididct.h" + +#if HAVE_MMX_INLINE + +//----------------------------------------------------------------------------- +// Various memory constants (trigonometric values or rounding values) +//----------------------------------------------------------------------------- + + +DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { + 13036,13036,13036,13036, // tg * (2<<16) + 0.5 + 27146,27146,27146,27146, // tg * (2<<16) + 0.5 + -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 + 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 + +DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { + 65536,65536, + 3597,3597, + 2260,2260, + 1203,1203, + 0,0, + 120,120, + 512,512, + 512,512}; + +//----------------------------------------------------------------------------- +// +// The first stage iDCT 8x8 - inverse DCTs of rows +// +//----------------------------------------------------------------------------- +// The 8-point inverse DCT direct algorithm +//----------------------------------------------------------------------------- +// +// static const short w[32] = { +// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), +// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), +// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), +// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), +// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), +// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), +// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), +// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; +// +// #define DCT_8_INV_ROW(x, y) +// { +// int a0, a1, a2, a3, b0, b1, b2, b3; +// +// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3]; +// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7]; +// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; +// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; +// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; +// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; +// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; +// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; +// +// y[0] = SHIFT_ROUND ( a0 + b0 ); +// y[1] = SHIFT_ROUND ( a1 + b1 ); +// y[2] = SHIFT_ROUND ( a2 + b2 ); +// y[3] = SHIFT_ROUND ( a3 + b3 ); +// y[4] = SHIFT_ROUND ( a3 - b3 ); +// y[5] = SHIFT_ROUND ( a2 - b2 ); +// y[6] = SHIFT_ROUND ( a1 - b1 ); +// y[7] = SHIFT_ROUND ( a0 - b0 ); +// } +// +//----------------------------------------------------------------------------- +// +// In this implementation the outputs of the iDCT-1D are multiplied +// for rows 0,4 - by 
cos_4_16, +// for rows 1,7 - by cos_1_16, +// for rows 2,6 - by cos_2_16, +// for rows 3,5 - by cos_3_16 +// and are shifted to the left for better accuracy +// +// For the constants used, +// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) +// +//----------------------------------------------------------------------------- + +//----------------------------------------------------------------------------- +// Tables for mmx processors +//----------------------------------------------------------------------------- + +// Table for rows 0,4 - constants are multiplied by cos_4_16 +DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = { + 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 + 21407,8867,8867,-21407, // w07 w05 w03 w01 + 16384,-16384,16384,16384, // w14 w12 w10 w08 + -8867,21407,-21407,-8867, // w15 w13 w11 w09 + 22725,12873,19266,-22725, // w22 w20 w18 w16 + 19266,4520,-4520,-12873, // w23 w21 w19 w17 + 12873,4520,4520,19266, // w30 w28 w26 w24 + -22725,19266,-12873,-22725, // w31 w29 w27 w25 +// Table for rows 1,7 - constants are multiplied by cos_1_16 + 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 + 29692,12299,12299,-29692, // w07 w05 w03 w01 + 22725,-22725,22725,22725, // w14 w12 w10 w08 + -12299,29692,-29692,-12299, // w15 w13 w11 w09 + 31521,17855,26722,-31521, // w22 w20 w18 w16 + 26722,6270,-6270,-17855, // w23 w21 w19 w17 + 17855,6270,6270,26722, // w30 w28 w26 w24 + -31521,26722,-17855,-31521, // w31 w29 w27 w25 +// Table for rows 2,6 - constants are multiplied by cos_2_16 + 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 + 27969,11585,11585,-27969, // w07 w05 w03 w01 + 21407,-21407,21407,21407, // w14 w12 w10 w08 + -11585,27969,-27969,-11585, // w15 w13 w11 w09 + 29692,16819,25172,-29692, // w22 w20 w18 w16 + 25172,5906,-5906,-16819, // w23 w21 w19 w17 + 16819,5906,5906,25172, // w30 w28 w26 w24 + -29692,25172,-16819,-29692, // w31 w29 w27 w25 +// Table for rows 3,5 - constants are multiplied by cos_3_16 + 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 + 25172,10426,10426,-25172, // w07 w05 w03 w01 + 19266,-19266,19266,19266, // w14 w12 w10 w08 + -10426,25172,-25172,-10426, // w15 w13 w11 w09 + 26722,15137,22654,-26722, // w22 w20 w18 w16 + 22654,5315,-5315,-15137, // w23 w21 w19 w17 + 15137,5315,5315,22654, // w30 w28 w26 w24 + -26722,22654,-15137,-26722, // w31 w29 w27 w25 +}; +//----------------------------------------------------------------------------- +// Tables for xmm processors +//----------------------------------------------------------------------------- + +// %3 for rows 0,4 - constants are multiplied by cos_4_16 +DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = { + 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 + 16384,8867,-16384,-21407, // w07 w06 w03 w02 + 16384,-8867,16384,-21407, // w13 w12 w09 w08 + -16384,21407,16384,-8867, // w15 w14 w11 w10 + 22725,19266,19266,-4520, // w21 w20 w17 w16 + 12873,4520,-22725,-12873, // w23 w22 w19 w18 + 12873,-22725,4520,-12873, // w29 w28 w25 w24 + 4520,19266,19266,-22725, // w31 w30 w27 w26 +// %3 for rows 1,7 - constants are multiplied by cos_1_16 + 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 + 22725,12299,-22725,-29692, // w07 w06 w03 w02 + 22725,-12299,22725,-29692, // w13 w12 w09 w08 + -22725,29692,22725,-12299, // w15 w14 w11 w10 + 31521,26722,26722,-6270, // w21 w20 w17 w16 + 17855,6270,-31521,-17855, // w23 w22 w19 w18 + 17855,-31521,6270,-17855, // w29 w28 w25 w24 + 6270,26722,26722,-31521, // w31 w30 w27 w26 +// %3 for rows 2,6 - constants are 
multiplied by cos_2_16 + 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 + 21407,11585,-21407,-27969, // w07 w06 w03 w02 + 21407,-11585,21407,-27969, // w13 w12 w09 w08 + -21407,27969,21407,-11585, // w15 w14 w11 w10 + 29692,25172,25172,-5906, // w21 w20 w17 w16 + 16819,5906,-29692,-16819, // w23 w22 w19 w18 + 16819,-29692,5906,-16819, // w29 w28 w25 w24 + 5906,25172,25172,-29692, // w31 w30 w27 w26 +// %3 for rows 3,5 - constants are multiplied by cos_3_16 + 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 + 19266,10426,-19266,-25172, // w07 w06 w03 w02 + 19266,-10426,19266,-25172, // w13 w12 w09 w08 + -19266,25172,19266,-10426, // w15 w14 w11 w10 + 26722,22654,22654,-5315, // w21 w20 w17 w16 + 15137,5315,-26722,-15137, // w23 w22 w19 w18 + 15137,-26722,5315,-15137, // w29 w28 w25 w24 + 5315,22654,22654,-26722, // w31 w30 w27 w26 +}; +//============================================================================= +// Helper macros for the code +//============================================================================= + +//----------------------------------------------------------------------------- +// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER +//----------------------------------------------------------------------------- + +#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ + "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ + "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ + "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ + "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ + "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ + "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ + "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ + "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ + "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ + "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ + "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ + "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ + "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\ + "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ + "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ + "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ + "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ + "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ + "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ + "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ + "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ + "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ + "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ + "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ + "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ + "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ + "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ + "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ + "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ + "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ + "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ + "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ + "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ + "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ + "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ + "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ + "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ + "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ + "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ + "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\ + "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ + "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ + "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ + "pslld 
$16,%%mm7 \n\t"/* y7 0 y5 0*/\ + "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ + "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ + "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ + + +//----------------------------------------------------------------------------- +// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER +//----------------------------------------------------------------------------- + +#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ + "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ + "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ + "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ + "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ + "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ + "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ + "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ + "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ + "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ + "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ + "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ + "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ + "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ + "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ + "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ + "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ + "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ + "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ + "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ + "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ + "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ + "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ + "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ + "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ + "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ + "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ + "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ + "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ + "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ + "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ + "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ + "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ + "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ + "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ + "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ + "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ + "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ + "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ + "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ + "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ + "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ + + +//----------------------------------------------------------------------------- +// +// The first stage DCT 8x8 - forward DCTs of columns +// +// The %2puts are multiplied +// for rows 0,4 - on cos_4_16, +// for rows 1,7 - on cos_1_16, +// for rows 2,6 - on cos_2_16, +// for rows 3,5 - on cos_3_16 +// and are shifted to the left for rise of accuracy +// +//----------------------------------------------------------------------------- +// +// The 8-point scaled forward DCT algorithm (26a8m) +// +//----------------------------------------------------------------------------- +// +// #define DCT_8_FRW_COL(x, y) +//{ +// short t0, t1, t2, t3, t4, t5, t6, t7; +// short tp03, tm03, tp12, tm12, tp65, tm65; +// short tp465, tm465, tp765, tm765; +// +// t0 = LEFT_SHIFT ( x[0] + x[7] ); +// t1 = LEFT_SHIFT ( x[1] + x[6] ); +// t2 = LEFT_SHIFT ( x[2] + x[5] ); +// t3 = LEFT_SHIFT ( x[3] + x[4] ); +// t4 = LEFT_SHIFT ( 
+//-----------------------------------------------------------------------------
+//
+// The first stage DCT 8x8 - forward DCTs of columns
+//
+// The inputs are multiplied
+// for rows 0,4 - on cos_4_16,
+// for rows 1,7 - on cos_1_16,
+// for rows 2,6 - on cos_2_16,
+// for rows 3,5 - on cos_3_16
+// and are shifted to the left to increase accuracy.
+//
+//-----------------------------------------------------------------------------
+//
+// The 8-point scaled forward DCT algorithm (26a8m)
+//
+//-----------------------------------------------------------------------------
+//
+// #define DCT_8_FRW_COL(x, y)
+// {
+//     short t0, t1, t2, t3, t4, t5, t6, t7;
+//     short tp03, tm03, tp12, tm12, tp65, tm65;
+//     short tp465, tm465, tp765, tm765;
+//
+//     t0 = LEFT_SHIFT ( x[0] + x[7] );
+//     t1 = LEFT_SHIFT ( x[1] + x[6] );
+//     t2 = LEFT_SHIFT ( x[2] + x[5] );
+//     t3 = LEFT_SHIFT ( x[3] + x[4] );
+//     t4 = LEFT_SHIFT ( x[3] - x[4] );
+//     t5 = LEFT_SHIFT ( x[2] - x[5] );
+//     t6 = LEFT_SHIFT ( x[1] - x[6] );
+//     t7 = LEFT_SHIFT ( x[0] - x[7] );
+//
+//     tp03 = t0 + t3;
+//     tm03 = t0 - t3;
+//     tp12 = t1 + t2;
+//     tm12 = t1 - t2;
+//
+//     y[0] = tp03 + tp12;
+//     y[4] = tp03 - tp12;
+//
+//     y[2] = tm03 + tm12 * tg_2_16;
+//     y[6] = tm03 * tg_2_16 - tm12;
+//
+//     tp65 = (t6 + t5) * cos_4_16;
+//     tm65 = (t6 - t5) * cos_4_16;
+//
+//     tp765 = t7 + tp65;
+//     tm765 = t7 - tp65;
+//     tp465 = t4 + tm65;
+//     tm465 = t4 - tm65;
+//
+//     y[1] = tp765 + tp465 * tg_1_16;
+//     y[7] = tp765 * tg_1_16 - tp465;
+//     y[5] = tm765 * tg_3_16 + tm465;
+//     y[3] = tm765 - tm465 * tg_3_16;
+// }
+//
+//-----------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_COL(INP, OUT)
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_COL(A1,A2)\
+ "movq 2*8(%3),%%mm0\n\t"\
+ "movq 16*3+" #A1 ",%%mm3\n\t"\
+ "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
+ "movq 16*5+" #A1 ",%%mm5\n\t"\
+ "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
+ "movq (%3),%%mm4\n\t"\
+ "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
+ "movq 16*7+" #A1 ",%%mm7\n\t"\
+ "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
+ "movq 16*1+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
+ "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
+ "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
+ "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
+ "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
+ "movq 3*8(%3),%%mm3\n\t"\
+ "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
+ "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
+ "movq %%mm4,%%mm5 \n\t"/* tp17*/\
+ "movq %%mm2,%%mm6 \n\t"/* tm17*/\
+ "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
+ "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
+ "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
+ "movq 1*8(%3),%%mm7\n\t"\
+ "movq %%mm4,%%mm1 \n\t"/* t1*/\
+ "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
+ "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
+ "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
+ "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
+ "movq 2*16+" #A1 ",%%mm5\n\t"\
+ "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
+ "movq 6*16+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
+ "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
+ "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
+ "movq 0*16+" #A1 ",%%mm2\n\t"\
+ "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
+ "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
+ "movq %%mm2,%%mm3 \n\t"/* x0*/\
+ "movq 4*16+" #A1 ",%%mm6\n\t"\
+ "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
+ "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
+ "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
+ "movq %%mm2,%%mm5 \n\t"/* tp04*/\
+ "movq %%mm3,%%mm6 \n\t"/* tm04*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
+ "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
+ "paddsw %%mm1,%%mm1 \n\t"/* b1*/\
+ "paddsw %%mm4,%%mm4 \n\t"/* b2*/\
+ "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
+ "movq %%mm3,%%mm7 \n\t"/* a1*/\
+ "movq %%mm6,%%mm0 \n\t"/* a2*/\
+ "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
+ "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
+ "psraw $6,%%mm3 \n\t"/* dst1*/\
+ "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
+ "psraw $6,%%mm6 \n\t"/* dst2*/\
+ "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
+ "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
+ "psraw $6,%%mm7 \n\t"/* dst6*/\
+ "movq %%mm5,%%mm4 \n\t"/* a0*/\
+ "psraw $6,%%mm0 \n\t"/* dst5*/\
+ "movq %%mm3,1*16+" #A2 "\n\t"\
+ "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
+ "movq %%mm6,2*16+" #A2 "\n\t"\
+ "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
+ "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
+ "psraw $6,%%mm5 \n\t"/* dst0*/\
+ "movq %%mm2,%%mm6 \n\t"/* a3*/\
+ "psraw $6,%%mm4 \n\t"/* dst7*/\
+ "movq %%mm0,5*16+" #A2 "\n\t"\
+ "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
+ "movq %%mm7,6*16+" #A2 "\n\t"\
+ "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
+ "movq %%mm5,0*16+" #A2 "\n\t"\
+ "psraw $6,%%mm2 \n\t"/* dst3*/\
+ "movq %%mm4,7*16+" #A2 "\n\t"\
+ "psraw $6,%%mm6 \n\t"/* dst4*/\
+ "movq %%mm2,3*16+" #A2 "\n\t"\
+ "movq %%mm6,4*16+" #A2 "\n\t"
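Wherever the comments above read "x3*(tg_3_16-1)" followed by a paddsw, the code is working around fixed-point range: pmulhw keeps only the high 16 bits of the signed product, i.e. it multiplies by c/2^16, and tan(3*pi/16) ~ 0.668 scaled by 2^16 does not fit in an int16_t. The table therefore stores (tan - 1) * 2^16 = -21746 and the code re-adds x. Scalar sketch (saturation ignored):

    #include <stdint.h>

    /* pmulhw model: multiply by c / 2^16, keeping the high word */
    static int16_t pmulhw_model(int16_t x, int16_t c)
    {
        return (int16_t)(((int32_t)x * c) >> 16);
    }

    /* x * tan(3*pi/16), computed as x*(tan - 1) + x */
    static int16_t mul_tg3_model(int16_t x)
    {
        return pmulhw_model(x, -21746) + x;
    }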
+
+//=============================================================================
+// Code
+//=============================================================================
+
+//-----------------------------------------------------------------------------
+// void ff_xvid_idct_mmx(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+void ff_xvid_idct_mmx(short *block)
+{
+    __asm__ volatile (
+        //# Process each row
+        DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+        DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+        DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+        DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+        DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+        DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+        DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+        DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+        //# Process the columns (4 at a time)
+        DCT_8_INV_COL(0(%0), 0(%0))
+        DCT_8_INV_COL(8(%0), 8(%0))
+        :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
+}
+
+void ff_xvid_idct_mmx_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_xvid_idct_mmx_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_xvid_idct_mmx(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+#endif /* HAVE_MMX_INLINE */
+
+#if HAVE_MMXEXT_INLINE
+
+//-----------------------------------------------------------------------------
+// void ff_xvid_idct_mmxext(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+void ff_xvid_idct_mmxext(short *block)
+{
+    __asm__ volatile (
+        //# Process each row
+        DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+        DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+        DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+        DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+        DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+        DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+        DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+        DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+        //# Process the columns (4 at a time)
+        DCT_8_INV_COL(0(%0), 0(%0))
+        DCT_8_INV_COL(8(%0), 8(%0))
+        :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
+}
+
+void ff_xvid_idct_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_xvid_idct_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_xvid_idct_mmxext(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+
+#endif /* HAVE_MMXEXT_INLINE */
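The _put/_add wrappers just run the in-place IDCT and hand the block to the clamped store helpers from idctdsp. What the _add path computes, in scalar form (a sketch of the semantics, not the MMX implementation; av_clip_uint8() is the libavutil clipping helper, and _put stores av_clip_uint8(block[...]) without the addition):

    static void add_pixels_clamped_sketch(const int16_t *block, uint8_t *dest,
                                          int line_size)
    {
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                dest[i * line_size + j] =
                    av_clip_uint8(dest[i * line_size + j] + block[i * 8 + j]);
    }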
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c
new file mode 100644
index 0000000000..9a87df00b8
--- /dev/null
+++ b/libavcodec/x86/xvididct_sse2.c
@@ -0,0 +1,406 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - SSE2 inverse discrete cosine transform -
+ *
+ * Copyright(C) 2003 Pascal Massimino
+ *
+ * Conversion to gcc syntax with modifications
+ * by Alexander Strange
+ *
+ * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+ *
+ * This file is part of Libav.
+ *
+ * Vertical pass is an implementation of the scheme:
+ * Loeffler C., Ligtenberg A., and Moschytz C.S.:
+ * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+ * Proc. ICASSP 1989, 988-991.
+ *
+ * Horizontal pass is a double 4x4 vector/matrix multiplication,
+ * (see also Intel's Application Note 922:
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * Copyright (C) 1999 Intel Corporation)
+ *
+ * More details at http://skal.planet-d.net/coding/dct.html
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Libav; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+
+#include "idctdsp.h"
+#include "xvididct.h"
+
+#if HAVE_SSE2_INLINE
+
+/**
+ * @file
+ * @brief SSE2 IDCT compatible with the Xvid IDCT
+ */
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
+DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
+DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
+DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
+DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
+
+DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
+    0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
+    0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
+    0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
+    0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
+    0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
+    0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
+    0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
+    0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
+    0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
+    0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
+    0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
+    0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
+    0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
+    0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
+    0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
+    0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+};
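The interleaved word order of the iTab* tables (w05 w04 w01 w00, ...) lines the constants up with the shuffled inputs so that pmaddwd yields the four even and four odd dot products of the 8-point row transform directly. A C model of pmaddwd on one 64-bit lane (a sketch):

    #include <stdint.h>

    /* pmaddwd: multiply four int16 pairs, add adjacent 32-bit products */
    static void pmaddwd_model(int32_t d[2], const int16_t a[4], const int16_t b[4])
    {
        d[0] = (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1];
        d[1] = (int32_t)a[2] * b[2] + (int32_t)a[3] * b[3];
    }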
+
+DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
+    65536, 65536, 65536, 65536,
+     3597,  3597,  3597,  3597,
+     2260,  2260,  2260,  2260,
+     1203,  1203,  1203,  1203,
+      120,   120,   120,   120,
+      512,   512,   512,   512
+};
+
+// Temporary storage before the column pass
+#define ROW1 "%%xmm6"
+#define ROW3 "%%xmm4"
+#define ROW5 "%%xmm5"
+#define ROW7 "%%xmm7"
+
+#define CLEAR_ODD(r) "pxor "r","r" \n\t"
+#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
+
+#if ARCH_X86_64
+
+# define ROW0 "%%xmm8"
+# define REG0 ROW0
+# define ROW2 "%%xmm9"
+# define REG2 ROW2
+# define ROW4 "%%xmm10"
+# define REG4 ROW4
+# define ROW6 "%%xmm11"
+# define REG6 ROW6
+# define CLEAR_EVEN(r) CLEAR_ODD(r)
+# define PUT_EVEN(dst) PUT_ODD(dst)
+# define XMMS "%%xmm12"
+# define MOV_32_ONLY "#"
+# define SREG2 REG2
+# define TAN3 "%%xmm13"
+# define TAN1 "%%xmm14"
+
+#else
+
+# define ROW0 "(%0)"
+# define REG0 "%%xmm4"
+# define ROW2 "2*16(%0)"
+# define REG2 "%%xmm4"
+# define ROW4 "4*16(%0)"
+# define REG4 "%%xmm6"
+# define ROW6 "6*16(%0)"
+# define REG6 "%%xmm6"
+# define CLEAR_EVEN(r)
+# define PUT_EVEN(dst) \
+    "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
+    "movdqa %%xmm2, "dst" \n\t"
+# define XMMS "%%xmm2"
+# define MOV_32_ONLY "movdqa "
+# define SREG2 "%%xmm7"
+# define TAN3 "%%xmm0"
+# define TAN1 "%%xmm2"
+
+#endif
+
+#define ROUND(x) "paddd "MANGLE(x)
+
+#define JZ(reg, to) \
+    "testl "reg","reg" \n\t" \
+    "jz "to" \n\t"
+
+#define JNZ(reg, to) \
+    "testl "reg","reg" \n\t" \
+    "jnz "to" \n\t"
+
+#define TEST_ONE_ROW(src, reg, clear) \
+    clear \
+    "movq "src", %%mm1 \n\t" \
+    "por 8+"src", %%mm1 \n\t" \
+    "paddusb %%mm0, %%mm1 \n\t" \
+    "pmovmskb %%mm1, "reg" \n\t"
+
+#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
+    clear1 \
+    clear2 \
+    "movq "row1", %%mm1 \n\t" \
+    "por 8+"row1", %%mm1 \n\t" \
+    "movq "row2", %%mm2 \n\t" \
+    "por 8+"row2", %%mm2 \n\t" \
+    "paddusb %%mm0, %%mm1 \n\t" \
+    "paddusb %%mm0, %%mm2 \n\t" \
+    "pmovmskb %%mm1, "reg1" \n\t" \
+    "pmovmskb %%mm2, "reg2" \n\t"
+
+///IDCT pass on rows.
+#define iMTX_MULT(src, table, rounder, put) \
+    "movdqa "src", %%xmm3 \n\t" \
+    "movdqa %%xmm3, %%xmm0 \n\t" \
+    "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
+    "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
+    "pmaddwd "table", %%xmm0 \n\t" \
+    "pmaddwd 16+"table", %%xmm1 \n\t" \
+    "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
+    "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
+    "pmaddwd 32+"table", %%xmm2 \n\t" \
+    "pmaddwd 48+"table", %%xmm3 \n\t" \
+    "paddd %%xmm1, %%xmm0 \n\t" \
+    "paddd %%xmm3, %%xmm2 \n\t" \
+    rounder", %%xmm0 \n\t" \
+    "movdqa %%xmm2, %%xmm3 \n\t" \
+    "paddd %%xmm0, %%xmm2 \n\t" \
+    "psubd %%xmm3, %%xmm0 \n\t" \
+    "psrad $11, %%xmm2 \n\t" \
+    "psrad $11, %%xmm0 \n\t" \
+    "packssdw %%xmm0, %%xmm2 \n\t" \
+    put \
+    "1: \n\t"
+
+#define iLLM_HEAD \
+    "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
+    "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
+
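TEST_ONE_ROW/TEST_TWO_ROWS detect all-zero rows without a per-coefficient branch: por folds the 16 row bytes onto 8, paddusb with m127 saturates any nonzero byte to a value whose sign bit is set, and pmovmskb gathers those sign bits into a general-purpose register for a single test. Scalar model (a sketch):

    #include <stdint.h>

    /* nonzero return <=> the 8-coefficient row contains a nonzero value */
    static int test_one_row_model(const int16_t row[8])
    {
        uint8_t acc = 0;
        for (int i = 0; i < 16; i++)
            acc |= ((const uint8_t *)row)[i]; /* the por folding       */
        return acc != 0;                      /* paddusb + pmovmskb bit */
    }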
+///IDCT pass on columns.
+#define iLLM_PASS(dct) \
+    "movdqa "TAN3", %%xmm1 \n\t" \
+    "movdqa "TAN1", %%xmm3 \n\t" \
+    "pmulhw %%xmm4, "TAN3" \n\t" \
+    "pmulhw %%xmm5, %%xmm1 \n\t" \
+    "paddsw %%xmm4, "TAN3" \n\t" \
+    "paddsw %%xmm5, %%xmm1 \n\t" \
+    "psubsw %%xmm5, "TAN3" \n\t" \
+    "paddsw %%xmm4, %%xmm1 \n\t" \
+    "pmulhw %%xmm7, %%xmm3 \n\t" \
+    "pmulhw %%xmm6, "TAN1" \n\t" \
+    "paddsw %%xmm6, %%xmm3 \n\t" \
+    "psubsw %%xmm7, "TAN1" \n\t" \
+    "movdqa %%xmm3, %%xmm7 \n\t" \
+    "movdqa "TAN1", %%xmm6 \n\t" \
+    "psubsw %%xmm1, %%xmm3 \n\t" \
+    "psubsw "TAN3", "TAN1" \n\t" \
+    "paddsw %%xmm7, %%xmm1 \n\t" \
+    "paddsw %%xmm6, "TAN3" \n\t" \
+    "movdqa %%xmm3, %%xmm6 \n\t" \
+    "psubsw "TAN3", %%xmm3 \n\t" \
+    "paddsw %%xmm6, "TAN3" \n\t" \
+    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+    "pmulhw %%xmm4, %%xmm3 \n\t" \
+    "pmulhw %%xmm4, "TAN3" \n\t" \
+    "paddsw "TAN3", "TAN3" \n\t" \
+    "paddsw %%xmm3, %%xmm3 \n\t" \
+    "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
+    MOV_32_ONLY ROW2", "REG2" \n\t" \
+    MOV_32_ONLY ROW6", "REG6" \n\t" \
+    "movdqa %%xmm7, %%xmm5 \n\t" \
+    "pmulhw "REG6", %%xmm7 \n\t" \
+    "pmulhw "REG2", %%xmm5 \n\t" \
+    "paddsw "REG2", %%xmm7 \n\t" \
+    "psubsw "REG6", %%xmm5 \n\t" \
+    MOV_32_ONLY ROW0", "REG0" \n\t" \
+    MOV_32_ONLY ROW4", "REG4" \n\t" \
+    MOV_32_ONLY" "TAN1", (%0) \n\t" \
+    "movdqa "REG0", "XMMS" \n\t" \
+    "psubsw "REG4", "REG0" \n\t" \
+    "paddsw "XMMS", "REG4" \n\t" \
+    "movdqa "REG4", "XMMS" \n\t" \
+    "psubsw %%xmm7, "REG4" \n\t" \
+    "paddsw "XMMS", %%xmm7 \n\t" \
+    "movdqa "REG0", "XMMS" \n\t" \
+    "psubsw %%xmm5, "REG0" \n\t" \
+    "paddsw "XMMS", %%xmm5 \n\t" \
+    "movdqa %%xmm5, "XMMS" \n\t" \
+    "psubsw "TAN3", %%xmm5 \n\t" \
+    "paddsw "XMMS", "TAN3" \n\t" \
+    "movdqa "REG0", "XMMS" \n\t" \
+    "psubsw %%xmm3, "REG0" \n\t" \
+    "paddsw "XMMS", %%xmm3 \n\t" \
+    MOV_32_ONLY" (%0), "TAN1" \n\t" \
+    "psraw $6, %%xmm5 \n\t" \
+    "psraw $6, "REG0" \n\t" \
+    "psraw $6, "TAN3" \n\t" \
+    "psraw $6, %%xmm3 \n\t" \
+    "movdqa "TAN3", 1*16("dct") \n\t" \
+    "movdqa %%xmm3, 2*16("dct") \n\t" \
+    "movdqa "REG0", 5*16("dct") \n\t" \
+    "movdqa %%xmm5, 6*16("dct") \n\t" \
+    "movdqa %%xmm7, %%xmm0 \n\t" \
+    "movdqa "REG4", %%xmm4 \n\t" \
+    "psubsw %%xmm1, %%xmm7 \n\t" \
+    "psubsw "TAN1", "REG4" \n\t" \
+    "paddsw %%xmm0, %%xmm1 \n\t" \
+    "paddsw %%xmm4, "TAN1" \n\t" \
+    "psraw $6, %%xmm1 \n\t" \
+    "psraw $6, %%xmm7 \n\t" \
+    "psraw $6, "TAN1" \n\t" \
+    "psraw $6, "REG4" \n\t" \
+    "movdqa %%xmm1, ("dct") \n\t" \
+    "movdqa "TAN1", 3*16("dct") \n\t" \
+    "movdqa "REG4", 4*16("dct") \n\t" \
+    "movdqa %%xmm7, 7*16("dct") \n\t"
+
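The sqrt2 constant (23170 ~ 2^16 / (2*sqrt(2))) is applied with pmulhw followed by a saturating self-addition: the halved multiply keeps the intermediate inside the int16 range, and the doubling restores the intended factor of 1/sqrt(2). Scalar equivalent (a sketch, saturation ignored):

    #include <stdint.h>

    static int16_t mul_icos4_model(int16_t x)
    {
        int16_t h = (int16_t)(((int32_t)x * 23170) >> 16); /* pmulhw: ~ x/(2*sqrt(2)) */
        return h + h;                                      /* paddsw: ~ x/sqrt(2)     */
    }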
+///IDCT pass on columns, assuming rows 4-7 are zero.
+#define iLLM_PASS_SPARSE(dct) \
+    "pmulhw %%xmm4, "TAN3" \n\t" \
+    "paddsw %%xmm4, "TAN3" \n\t" \
+    "movdqa %%xmm6, %%xmm3 \n\t" \
+    "pmulhw %%xmm6, "TAN1" \n\t" \
+    "movdqa %%xmm4, %%xmm1 \n\t" \
+    "psubsw %%xmm1, %%xmm3 \n\t" \
+    "paddsw %%xmm6, %%xmm1 \n\t" \
+    "movdqa "TAN1", %%xmm6 \n\t" \
+    "psubsw "TAN3", "TAN1" \n\t" \
+    "paddsw %%xmm6, "TAN3" \n\t" \
+    "movdqa %%xmm3, %%xmm6 \n\t" \
+    "psubsw "TAN3", %%xmm3 \n\t" \
+    "paddsw %%xmm6, "TAN3" \n\t" \
+    "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+    "pmulhw %%xmm4, %%xmm3 \n\t" \
+    "pmulhw %%xmm4, "TAN3" \n\t" \
+    "paddsw "TAN3", "TAN3" \n\t" \
+    "paddsw %%xmm3, %%xmm3 \n\t" \
+    "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
+    MOV_32_ONLY ROW2", "SREG2" \n\t" \
+    "pmulhw "SREG2", %%xmm5 \n\t" \
+    MOV_32_ONLY ROW0", "REG0" \n\t" \
+    "movdqa "REG0", %%xmm6 \n\t" \
+    "psubsw "SREG2", %%xmm6 \n\t" \
+    "paddsw "REG0", "SREG2" \n\t" \
+    MOV_32_ONLY" "TAN1", (%0) \n\t" \
+    "movdqa "REG0", "XMMS" \n\t" \
+    "psubsw %%xmm5, "REG0" \n\t" \
+    "paddsw "XMMS", %%xmm5 \n\t" \
+    "movdqa %%xmm5, "XMMS" \n\t" \
+    "psubsw "TAN3", %%xmm5 \n\t" \
+    "paddsw "XMMS", "TAN3" \n\t" \
+    "movdqa "REG0", "XMMS" \n\t" \
+    "psubsw %%xmm3, "REG0" \n\t" \
+    "paddsw "XMMS", %%xmm3 \n\t" \
+    MOV_32_ONLY" (%0), "TAN1" \n\t" \
+    "psraw $6, %%xmm5 \n\t" \
+    "psraw $6, "REG0" \n\t" \
+    "psraw $6, "TAN3" \n\t" \
+    "psraw $6, %%xmm3 \n\t" \
+    "movdqa "TAN3", 1*16("dct") \n\t" \
+    "movdqa %%xmm3, 2*16("dct") \n\t" \
+    "movdqa "REG0", 5*16("dct") \n\t" \
+    "movdqa %%xmm5, 6*16("dct") \n\t" \
+    "movdqa "SREG2", %%xmm0 \n\t" \
+    "movdqa %%xmm6, %%xmm4 \n\t" \
+    "psubsw %%xmm1, "SREG2" \n\t" \
+    "psubsw "TAN1", %%xmm6 \n\t" \
+    "paddsw %%xmm0, %%xmm1 \n\t" \
+    "paddsw %%xmm4, "TAN1" \n\t" \
+    "psraw $6, %%xmm1 \n\t" \
+    "psraw $6, "SREG2" \n\t" \
+    "psraw $6, "TAN1" \n\t" \
+    "psraw $6, %%xmm6 \n\t" \
+    "movdqa %%xmm1, ("dct") \n\t" \
+    "movdqa "TAN1", 3*16("dct") \n\t" \
+    "movdqa %%xmm6, 4*16("dct") \n\t" \
+    "movdqa "SREG2", 7*16("dct") \n\t"
+
+inline void ff_xvid_idct_sse2(short *block)
+{
+    __asm__ volatile (
+        "movq "MANGLE(m127)", %%mm0 \n\t"
+        iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
+        iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
+        iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
+
+        TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
+        JZ("%%eax", "1f")
+        iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
+
+        TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
+        TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
+        iLLM_HEAD
+        ".p2align 4 \n\t"
+        JNZ("%%ecx", "2f")
+        JNZ("%%eax", "3f")
+        JNZ("%%edx", "4f")
+        JNZ("%%esi", "5f")
+        iLLM_PASS_SPARSE("%0")
+        "jmp 6f \n\t"
+        "2: \n\t"
+        iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
+        "3: \n\t"
+        iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
+        JZ("%%edx", "1f")
+        "4: \n\t"
+        iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
+        JZ("%%esi", "1f")
+        "5: \n\t"
+        iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
+#if ARCH_X86_32
+        iLLM_HEAD
+#endif
+        iLLM_PASS("%0")
+        "6: \n\t"
+        : "+r"(block)
+        :
+        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+#if ARCH_X86_64
+          XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
+                       "%xmm12", "%xmm13", "%xmm14",)
+#endif
"%eax", "%ecx", "%edx", "%esi", "memory" + ); +} + +void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_sse2(block); + ff_put_pixels_clamped_mmx(block, dest, line_size); +} + +void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block) +{ + ff_xvid_idct_sse2(block); + ff_add_pixels_clamped_mmx(block, dest, line_size); +} + +#endif /* HAVE_SSE2_INLINE */ -- cgit v1.2.3