19 files changed, 6279 insertions, 0 deletions
diff --git a/libavfilter/libmpcodecs/av_helpers.h b/libavfilter/libmpcodecs/av_helpers.h
new file mode 100644
index 0000000000..90b67d5a0f
--- /dev/null
+++ b/libavfilter/libmpcodecs/av_helpers.h
@@ -0,0 +1,27 @@
+/*
+ * Generic libav* helpers
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_AV_HELPERS_H
+#define MPLAYER_AV_HELPERS_H
+
+void ff_init_avcodec(void);
+void ff_init_avformat(void);
+
+#endif /* MPLAYER_AV_HELPERS_H */
diff --git a/libavfilter/libmpcodecs/cpudetect.h b/libavfilter/libmpcodecs/cpudetect.h
new file mode 100644
index 0000000000..710f6e6513
--- /dev/null
+++ b/libavfilter/libmpcodecs/cpudetect.h
@@ -0,0 +1,60 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_CPUDETECT_H
+#define MPLAYER_CPUDETECT_H
+
+#define CPUTYPE_I386    3
+#define CPUTYPE_I486    4
+#define CPUTYPE_I586    5
+#define CPUTYPE_I686    6
+
+#include "libavutil/x86_cpu.h"
+
+typedef struct cpucaps_s { // CPU description; ff_GetCpuCaps() takes a pointer to one, ff_gCpuCaps is a global instance
+    int cpuType;      // NOTE(review): presumably one of the CPUTYPE_* values defined above - confirm
+    int cpuModel;
+    int cpuStepping;
+    int hasMMX;       // NOTE(review): has* fields are presumably nonzero when the named extension is available - confirm
+    int hasMMX2;
+    int has3DNow;
+    int has3DNowExt;
+    int hasSSE;
+    int hasSSE2;
+    int hasSSE3;
+    int hasSSSE3;
+    int hasSSE4;
+    int hasSSE42;
+    int hasSSE4a;
+    int hasAVX;
+    int isX86;
+    unsigned cl_size; /* size of cache line */
+    int hasAltiVec;
+    int hasTSC;
+} CpuCaps;
+
+extern CpuCaps ff_gCpuCaps;
+
+void ff_do_cpuid(unsigned int ax, unsigned int *p);
+
+void ff_GetCpuCaps(CpuCaps *caps);
+
+/* returned value is malloc()'ed so free() it after use */
+char *ff_GetCpuFriendlyName(unsigned int regs[], unsigned int regs2[]);
+
+#endif /* MPLAYER_CPUDETECT_H */
diff --git a/libavfilter/libmpcodecs/img_format.c b/libavfilter/libmpcodecs/img_format.c
new file mode 100644
index 0000000000..dd07f00a0a
--- /dev/null
+++ b/libavfilter/libmpcodecs/img_format.c
@@ -0,0 +1,244 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+#include "img_format.h"
+#include "stdio.h"
+#include "libavutil/bswap.h"
+
+const char *ff_vo_format_name(int format)
+{
+    static char unknown_format[20]; // shared fallback buffer, overwritten by the next unknown format
+    static const struct { int fmt; const char *name; } known[] = {
+        { IMGFMT_RGB1,    "RGB 1-bit" },
+        { IMGFMT_RGB4,    "RGB 4-bit" },
+        { IMGFMT_RG4B,    "RGB 4-bit per byte" },
+        { IMGFMT_RGB8,    "RGB 8-bit" },
+        { IMGFMT_RGB12,   "RGB 12-bit" },
+        { IMGFMT_RGB15,   "RGB 15-bit" },
+        { IMGFMT_RGB16,   "RGB 16-bit" },
+        { IMGFMT_RGB24,   "RGB 24-bit" }, // IMGFMT_RGB32 is one of the ABGR/BGRA/ARGB/RGBA aliases below
+        { IMGFMT_RGB48LE, "RGB 48-bit LE" },
+        { IMGFMT_RGB48BE, "RGB 48-bit BE" },
+        { IMGFMT_RGB64LE, "RGB 64-bit LE" },
+        { IMGFMT_RGB64BE, "RGB 64-bit BE" },
+        { IMGFMT_BGR1,    "BGR 1-bit" },
+        { IMGFMT_BGR4,    "BGR 4-bit" },
+        { IMGFMT_BG4B,    "BGR 4-bit per byte" },
+        { IMGFMT_BGR8,    "BGR 8-bit" },
+        { IMGFMT_BGR12,   "BGR 12-bit" },
+        { IMGFMT_BGR15,   "BGR 15-bit" },
+        { IMGFMT_BGR16,   "BGR 16-bit" },
+        { IMGFMT_BGR24,   "BGR 24-bit" }, // IMGFMT_BGR32 is one of the ABGR/BGRA/ARGB/RGBA aliases below
+        { IMGFMT_ABGR,    "ABGR" },
+        { IMGFMT_BGRA,    "BGRA" },
+        { IMGFMT_ARGB,    "ARGB" },
+        { IMGFMT_RGBA,    "RGBA" },
+        { IMGFMT_XYZ12LE, "XYZ 36-bit LE" },
+        { IMGFMT_XYZ12BE, "XYZ 36-bit BE" },
+        { IMGFMT_GBR24P,  "Planar GBR 24-bit" },
+        { IMGFMT_GBR12P,  "Planar GBR 36-bit" },
+        { IMGFMT_GBR14P,  "Planar GBR 42-bit" },
+        { IMGFMT_YVU9,    "Planar YVU9" },
+        { IMGFMT_IF09,    "Planar IF09" },
+        { IMGFMT_YV12,    "Planar YV12" },
+        { IMGFMT_I420,    "Planar I420" },
+        { IMGFMT_IYUV,    "Planar IYUV" },
+        { IMGFMT_CLPL,    "Planar CLPL" },
+        { IMGFMT_Y800,    "Planar Y800" },
+        { IMGFMT_Y8,      "Planar Y8" },
+        { IMGFMT_Y8A,     "Planar Y8 with alpha" },
+        { IMGFMT_Y16_LE,  "Planar Y16 little-endian" },
+        { IMGFMT_Y16_BE,  "Planar Y16 big-endian" },
+        { IMGFMT_420P16_LE, "Planar 420P 16-bit little-endian" },
+        { IMGFMT_420P16_BE, "Planar 420P 16-bit big-endian" },
+        { IMGFMT_420P14_LE, "Planar 420P 14-bit little-endian" },
+        { IMGFMT_420P14_BE, "Planar 420P 14-bit big-endian" },
+        { IMGFMT_420P12_LE, "Planar 420P 12-bit little-endian" },
+        { IMGFMT_420P12_BE, "Planar 420P 12-bit big-endian" },
+        { IMGFMT_420P10_LE, "Planar 420P 10-bit little-endian" },
+        { IMGFMT_420P10_BE, "Planar 420P 10-bit big-endian" },
+        { IMGFMT_420P9_LE,  "Planar 420P 9-bit little-endian" },
+        { IMGFMT_420P9_BE,  "Planar 420P 9-bit big-endian" },
+        { IMGFMT_422P16_LE, "Planar 422P 16-bit little-endian" },
+        { IMGFMT_422P16_BE, "Planar 422P 16-bit big-endian" },
+        { IMGFMT_422P14_LE, "Planar 422P 14-bit little-endian" },
+        { IMGFMT_422P14_BE, "Planar 422P 14-bit big-endian" },
+        { IMGFMT_422P12_LE, "Planar 422P 12-bit little-endian" },
+        { IMGFMT_422P12_BE, "Planar 422P 12-bit big-endian" },
+        { IMGFMT_422P10_LE, "Planar 422P 10-bit little-endian" },
+        { IMGFMT_422P10_BE, "Planar 422P 10-bit big-endian" },
+        { IMGFMT_422P9_LE,  "Planar 422P 9-bit little-endian" },
+        { IMGFMT_422P9_BE,  "Planar 422P 9-bit big-endian" },
+        { IMGFMT_444P16_LE, "Planar 444P 16-bit little-endian" },
+        { IMGFMT_444P16_BE, "Planar 444P 16-bit big-endian" },
+        { IMGFMT_444P14_LE, "Planar 444P 14-bit little-endian" },
+        { IMGFMT_444P14_BE, "Planar 444P 14-bit big-endian" },
+        { IMGFMT_444P12_LE, "Planar 444P 12-bit little-endian" },
+        { IMGFMT_444P12_BE, "Planar 444P 12-bit big-endian" },
+        { IMGFMT_444P10_LE, "Planar 444P 10-bit little-endian" },
+        { IMGFMT_444P10_BE, "Planar 444P 10-bit big-endian" },
+        { IMGFMT_444P9_LE,  "Planar 444P 9-bit little-endian" },
+        { IMGFMT_444P9_BE,  "Planar 444P 9-bit big-endian" },
+        { IMGFMT_420A, "Planar 420P with alpha" },
+        { IMGFMT_444P, "Planar 444P" },
+        { IMGFMT_444A, "Planar 444P with alpha" },
+        { IMGFMT_422P, "Planar 422P" },
+        { IMGFMT_422A, "Planar 422P with alpha" },
+        { IMGFMT_411P, "Planar 411P" },
+        { IMGFMT_440P, "Planar 440P" },
+        { IMGFMT_NV12, "Planar NV12" },
+        { IMGFMT_NV21, "Planar NV21" },
+        { IMGFMT_HM12, "Planar NV12 Macroblock" },
+        { IMGFMT_IUYV, "Packed IUYV" },
+        { IMGFMT_IY41, "Packed IY41" },
+        { IMGFMT_IYU1, "Packed IYU1" },
+        { IMGFMT_IYU2, "Packed IYU2" },
+        { IMGFMT_UYVY, "Packed UYVY" },
+        { IMGFMT_UYNV, "Packed UYNV" },
+        { IMGFMT_cyuv, "Packed CYUV" },
+        { IMGFMT_Y422, "Packed Y422" },
+        { IMGFMT_YUY2, "Packed YUY2" },
+        { IMGFMT_YUNV, "Packed YUNV" },
+        { IMGFMT_YVYU, "Packed YVYU" },
+        { IMGFMT_Y41P, "Packed Y41P" },
+        { IMGFMT_Y211, "Packed Y211" },
+        { IMGFMT_Y41T, "Packed Y41T" },
+        { IMGFMT_Y42T, "Packed Y42T" },
+        { IMGFMT_V422, "Packed V422" },
+        { IMGFMT_V655, "Packed V655" },
+        { IMGFMT_CLJR, "Packed CLJR" },
+        { IMGFMT_YUVP, "Packed YUVP" },
+        { IMGFMT_UYVP, "Packed UYVP" },
+        { IMGFMT_MPEGPES,         "Mpeg PES" },
+        { IMGFMT_ZRMJPEGNI,       "Zoran MJPEG non-interlaced" },
+        { IMGFMT_ZRMJPEGIT,       "Zoran MJPEG top field first" },
+        { IMGFMT_ZRMJPEGIB,       "Zoran MJPEG bottom field first" },
+        { IMGFMT_XVMC_MOCO_MPEG2, "MPEG1/2 Motion Compensation" },
+        { IMGFMT_XVMC_IDCT_MPEG2, "MPEG1/2 Motion Compensation and IDCT" },
+        { IMGFMT_VDPAU_MPEG1,     "MPEG1 VDPAU acceleration" },
+        { IMGFMT_VDPAU_MPEG2,     "MPEG2 VDPAU acceleration" },
+        { IMGFMT_VDPAU_H264,      "H.264 VDPAU acceleration" },
+        { IMGFMT_VDPAU_MPEG4,     "MPEG-4 Part 2 VDPAU acceleration" },
+        { IMGFMT_VDPAU_WMV3,      "WMV3 VDPAU acceleration" },
+        { IMGFMT_VDPAU_VC1,       "VC1 VDPAU acceleration" },
+    };
+    unsigned i;
+    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) // first (and only) exact match wins
+        if (known[i].fmt == format) return known[i].name;
+    snprintf(unknown_format, sizeof(unknown_format), "Unknown 0x%04x", format);
+    return unknown_format;
+}
+
+/**
+ * Calculate the scale shifts for the chroma planes of a planar YUV format.
+ *
+ * NV12/NV21 are recognised but still reported as unsupported (return 0),
+ * see the TODO in the body.
+ *
+ * \param format         IMGFMT_* identifier to inspect
+ * \param x_shift        if non-NULL, receives the horizontal chroma shift
+ * \param y_shift        if non-NULL, receives the vertical chroma shift
+ * \param component_bits if non-NULL, receives the bits per component
+ * \return bits per pixel of the format, or 0 if the format is not handled
+ *         (the output parameters are written in either case)
+ */
+int ff_mp_get_chroma_shift(int format, int *x_shift, int *y_shift, int *component_bits)
+{
+    int xs = 0, ys = 0;
+    int bpp;
+    int err = 0;
+    int bits = 8;
+
+    // big-endian IDs are the byte-swapped little-endian IDs: normalise to LE
+    if ((format & 0xff0000f0) == 0x34000050)
+        format = av_bswap32(format);
+
+    if ((format & 0xf00000ff) == 0x50000034) {
+        // top byte: 0x50 ('P') is 8 bit, 0x51..0x55 tag the deeper variants
+        switch (format >> 24) {
+        case 0x50:            break;
+        case 0x51: bits = 16; break;
+        case 0x52: bits = 10; break;
+        case 0x53: bits =  9; break;
+        case 0x54: bits = 14; break; // IMGFMT_4xxP14_*
+        case 0x55: bits = 12; break; // IMGFMT_4xxP12_*
+        default:   err  =  1; break;
+        }
+        // the remaining three bytes spell the subsampling in ASCII digits
+        switch (format & 0x00ffffff) {
+        case 0x00343434: xs = 0; ys = 0; break; // 444
+        case 0x00323234: xs = 1; ys = 0; break; // 422
+        case 0x00303234: xs = 1; ys = 1; break; // 420
+        case 0x00313134: xs = 2; ys = 0; break; // 411
+        case 0x00303434: xs = 0; ys = 1; break; // 440
+        default:         err = 1;        break;
+        }
+    } else switch (format) {
+    case IMGFMT_444A:
+        xs = 0;
+        ys = 0;
+        break;
+    case IMGFMT_422A:
+        xs = 1;
+        ys = 0;
+        break;
+    case IMGFMT_420A:
+    case IMGFMT_I420:
+    case IMGFMT_IYUV:
+    case IMGFMT_YV12:
+        xs = 1;
+        ys = 1;
+        break;
+    case IMGFMT_IF09:
+    case IMGFMT_YVU9:
+        xs = 2;
+        ys = 2;
+        break;
+    case IMGFMT_Y8:
+    case IMGFMT_Y800:
+        // a shift of 31 reduces the chroma term of bpp below to 0
+        xs = 31;
+        ys = 31;
+        break;
+    case IMGFMT_NV12:
+    case IMGFMT_NV21:
+        xs = 1;
+        ys = 1;
+        // TODO: allowing this though currently breaks
+        // things all over the place.
+        err = 1;
+        break;
+    default:
+        err = 1;
+        break;
+    }
+
+    if (x_shift) *x_shift = xs;
+    if (y_shift) *y_shift = ys;
+    if (component_bits) *component_bits = bits;
+
+    // 8 luma bits plus 16 chroma bits reduced by the subsampling shifts
+    bpp = 8 + ((16 >> xs) >> ys);
+    // the alpha formats carry 8 more bits per pixel
+    if (format == IMGFMT_420A || format == IMGFMT_422A || format == IMGFMT_444A)
+        bpp += 8;
+    // scale by the number of bytes used per component
+    bpp *= (bits + 7) >> 3;
+    return err ? 0 : bpp;
+}
diff --git a/libavfilter/libmpcodecs/img_format.h b/libavfilter/libmpcodecs/img_format.h
new file mode 100644
index 0000000000..b5c0b9007f
--- /dev/null
+++ b/libavfilter/libmpcodecs/img_format.h
@@ -0,0 +1,309 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_IMG_FORMAT_H
+#define MPLAYER_IMG_FORMAT_H
+
+#include "config.h"
+
+/* RGB/BGR Formats */
+
+#define IMGFMT_RGB_MASK 0xFFFFFF00
+#define IMGFMT_RGB (('R'<<24)|('G'<<16)|('B'<<8))
+#define IMGFMT_RGB1  (IMGFMT_RGB|1)
+#define IMGFMT_RGB4  (IMGFMT_RGB|4)
+#define IMGFMT_RGB4_CHAR  (IMGFMT_RGB|4|128) // RGB4 with 1 pixel per byte
+#define IMGFMT_RGB8  (IMGFMT_RGB|8)
+#define IMGFMT_RGB12 (IMGFMT_RGB|12)
+#define IMGFMT_RGB15 (IMGFMT_RGB|15)
+#define IMGFMT_RGB16 (IMGFMT_RGB|16)
+#define IMGFMT_RGB24 (IMGFMT_RGB|24)
+#define IMGFMT_RGB32 (IMGFMT_RGB|32)
+#define IMGFMT_RGB48LE (IMGFMT_RGB|48)
+#define IMGFMT_RGB48BE (IMGFMT_RGB|48|128)
+#define IMGFMT_RGB64LE (IMGFMT_RGB|64)
+#define IMGFMT_RGB64BE (IMGFMT_RGB|64|128)
+
+#define IMGFMT_BGR_MASK 0xFFFFFF00
+#define IMGFMT_BGR (('B'<<24)|('G'<<16)|('R'<<8))
+#define IMGFMT_BGR1  (IMGFMT_BGR|1)
+#define IMGFMT_BGR4  (IMGFMT_BGR|4)
+#define IMGFMT_BGR4_CHAR (IMGFMT_BGR|4|128) // BGR4 with 1 pixel per byte
+#define IMGFMT_BGR8  (IMGFMT_BGR|8)
+#define IMGFMT_BGR12 (IMGFMT_BGR|12)
+#define IMGFMT_BGR15 (IMGFMT_BGR|15)
+#define IMGFMT_BGR16 (IMGFMT_BGR|16)
+#define IMGFMT_BGR24 (IMGFMT_BGR|24)
+#define IMGFMT_BGR32 (IMGFMT_BGR|32)
+
+#define IMGFMT_XYZ_MASK 0xFFFFFF00
+#define IMGFMT_XYZ (('X'<<24)|('Y'<<16)|('Z'<<8))
+#define IMGFMT_XYZ12LE (IMGFMT_XYZ|12)
+#define IMGFMT_XYZ12BE (IMGFMT_XYZ|12|128)
+
+#define IMGFMT_GBR24P (('G'<<24)|('B'<<16)|('R'<<8)|24)
+#define IMGFMT_GBR12PLE (('G'<<24)|('B'<<16)|('R'<<8)|36)
+#define IMGFMT_GBR12PBE (('G'<<24)|('B'<<16)|('R'<<8)|36|128)
+#define IMGFMT_GBR14PLE (('G'<<24)|('B'<<16)|('R'<<8)|42)
+#define IMGFMT_GBR14PBE (('G'<<24)|('B'<<16)|('R'<<8)|42|128)
+
+#if HAVE_BIGENDIAN
+#define IMGFMT_ABGR    IMGFMT_RGB32
+#define IMGFMT_BGRA    (IMGFMT_RGB32|128)
+#define IMGFMT_ARGB    IMGFMT_BGR32
+#define IMGFMT_RGBA    (IMGFMT_BGR32|128)
+#define IMGFMT_RGB64NE IMGFMT_RGB64BE
+#define IMGFMT_RGB48NE IMGFMT_RGB48BE
+#define IMGFMT_RGB12BE IMGFMT_RGB12
+#define IMGFMT_RGB12LE (IMGFMT_RGB12|128)
+#define IMGFMT_RGB15BE IMGFMT_RGB15
+#define IMGFMT_RGB15LE (IMGFMT_RGB15|128)
+#define IMGFMT_RGB16BE IMGFMT_RGB16
+#define IMGFMT_RGB16LE (IMGFMT_RGB16|128)
+#define IMGFMT_BGR12BE IMGFMT_BGR12
+#define IMGFMT_BGR12LE (IMGFMT_BGR12|128)
+#define IMGFMT_BGR15BE IMGFMT_BGR15
+#define IMGFMT_BGR15LE (IMGFMT_BGR15|128)
+#define IMGFMT_BGR16BE IMGFMT_BGR16
+#define IMGFMT_BGR16LE (IMGFMT_BGR16|128)
+#define IMGFMT_XYZ12  IMGFMT_XYZ12BE
+#define IMGFMT_GBR12P IMGFMT_GBR12PBE
+#define IMGFMT_GBR14P IMGFMT_GBR14PBE
+#else
+#define IMGFMT_ABGR (IMGFMT_BGR32|128)
+#define IMGFMT_BGRA IMGFMT_BGR32
+#define IMGFMT_ARGB (IMGFMT_RGB32|128)
+#define IMGFMT_RGBA IMGFMT_RGB32
+#define IMGFMT_RGB64NE IMGFMT_RGB64LE
+#define IMGFMT_RGB48NE IMGFMT_RGB48LE
+#define IMGFMT_RGB12BE (IMGFMT_RGB12|128)
+#define IMGFMT_RGB12LE IMGFMT_RGB12
+#define IMGFMT_RGB15BE (IMGFMT_RGB15|128)
+#define IMGFMT_RGB15LE IMGFMT_RGB15
+#define IMGFMT_RGB16BE (IMGFMT_RGB16|128)
+#define IMGFMT_RGB16LE IMGFMT_RGB16
+#define IMGFMT_BGR12BE (IMGFMT_BGR12|128)
+#define IMGFMT_BGR12LE IMGFMT_BGR12
+#define IMGFMT_BGR15BE (IMGFMT_BGR15|128)
+#define IMGFMT_BGR15LE IMGFMT_BGR15
+#define IMGFMT_BGR16BE (IMGFMT_BGR16|128)
+#define IMGFMT_BGR16LE IMGFMT_BGR16
+#define IMGFMT_XYZ12  IMGFMT_XYZ12LE
+#define IMGFMT_GBR12P IMGFMT_GBR12PLE
+#define IMGFMT_GBR14P IMGFMT_GBR14PLE
+#endif
+
+/* old names for compatibility */
+#define IMGFMT_RG4B  IMGFMT_RGB4_CHAR
+#define IMGFMT_BG4B  IMGFMT_BGR4_CHAR
+
+#define IMGFMT_IS_RGB(fmt) (((fmt)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
+#define IMGFMT_IS_BGR(fmt) (((fmt)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
+#define IMGFMT_IS_XYZ(fmt) (((fmt)&IMGFMT_XYZ_MASK)==IMGFMT_XYZ)
+
+#define IMGFMT_RGB_DEPTH(fmt) ((fmt)&0x7F)
+#define IMGFMT_BGR_DEPTH(fmt) ((fmt)&0x7F)
+#define IMGFMT_XYZ_DEPTH(fmt) ((fmt)&0x7F)
+
+
+/* Planar YUV Formats */
+
+#define IMGFMT_YVU9 0x39555659
+#define IMGFMT_IF09 0x39304649
+#define IMGFMT_YV12 0x32315659
+#define IMGFMT_I420 0x30323449
+#define IMGFMT_IYUV 0x56555949
+#define IMGFMT_CLPL 0x4C504C43
+#define IMGFMT_Y800 0x30303859
+#define IMGFMT_Y8   0x20203859
+#define IMGFMT_NV12 0x3231564E
+#define IMGFMT_NV21 0x3132564E
+#define IMGFMT_Y16_LE 0x20363159
+
+/* unofficial Planar Formats, FIXME if official 4CC exists */
+#define IMGFMT_444P 0x50343434
+#define IMGFMT_422P 0x50323234
+#define IMGFMT_411P 0x50313134
+#define IMGFMT_440P 0x50303434
+#define IMGFMT_HM12 0x32314D48
+#define IMGFMT_Y16_BE 0x59313620
+
+// Gray with alpha
+#define IMGFMT_Y8A 0x59320008
+// 4:2:0 planar with alpha
+#define IMGFMT_420A 0x41303234
+// 4:2:2 planar with alpha
+#define IMGFMT_422A 0x41323234
+// 4:4:4 planar with alpha
+#define IMGFMT_444A 0x41343434
+
+#define IMGFMT_444P16_LE 0x51343434
+#define IMGFMT_444P16_BE 0x34343451
+#define IMGFMT_444P14_LE 0x54343434
+#define IMGFMT_444P14_BE 0x34343454
+#define IMGFMT_444P12_LE 0x55343434
+#define IMGFMT_444P12_BE 0x34343455
+#define IMGFMT_444P10_LE 0x52343434
+#define IMGFMT_444P10_BE 0x34343452
+#define IMGFMT_444P9_LE  0x53343434
+#define IMGFMT_444P9_BE  0x34343453
+#define IMGFMT_422P16_LE 0x51323234
+#define IMGFMT_422P16_BE 0x34323251
+#define IMGFMT_422P14_LE 0x54323234
+#define IMGFMT_422P14_BE 0x34323254
+#define IMGFMT_422P12_LE 0x55323234
+#define IMGFMT_422P12_BE 0x34323255
+#define IMGFMT_422P10_LE 0x52323234
+#define IMGFMT_422P10_BE 0x34323252
+#define IMGFMT_422P9_LE  0x53323234
+#define IMGFMT_422P9_BE  0x34323253
+#define IMGFMT_420P16_LE 0x51303234
+#define IMGFMT_420P16_BE 0x34323051
+#define IMGFMT_420P14_LE 0x54303234
+#define IMGFMT_420P14_BE 0x34323054
+#define IMGFMT_420P12_LE 0x55303234
+#define IMGFMT_420P12_BE 0x34323055
+#define IMGFMT_420P10_LE 0x52303234
+#define IMGFMT_420P10_BE 0x34323052
+#define IMGFMT_420P9_LE  0x53303234
+#define IMGFMT_420P9_BE  0x34323053
+#if HAVE_BIGENDIAN
+#define IMGFMT_444P16 IMGFMT_444P16_BE
+#define IMGFMT_444P14 IMGFMT_444P14_BE
+#define IMGFMT_444P12 IMGFMT_444P12_BE
+#define IMGFMT_444P10 IMGFMT_444P10_BE
+#define IMGFMT_444P9  IMGFMT_444P9_BE
+#define IMGFMT_422P16 IMGFMT_422P16_BE
+#define IMGFMT_422P14 IMGFMT_422P14_BE
+#define IMGFMT_422P12 IMGFMT_422P12_BE
+#define IMGFMT_422P10 IMGFMT_422P10_BE
+#define IMGFMT_422P9  IMGFMT_422P9_BE
+#define IMGFMT_420P16 IMGFMT_420P16_BE
+#define IMGFMT_420P14 IMGFMT_420P14_BE
+#define IMGFMT_420P12 IMGFMT_420P12_BE
+#define IMGFMT_420P10 IMGFMT_420P10_BE
+#define IMGFMT_420P9  IMGFMT_420P9_BE
+#define IMGFMT_Y16    IMGFMT_Y16_BE
+#define IMGFMT_IS_YUVP16_NE(fmt) IMGFMT_IS_YUVP16_BE(fmt)
+#else
+#define IMGFMT_444P16 IMGFMT_444P16_LE
+#define IMGFMT_444P14 IMGFMT_444P14_LE
+#define IMGFMT_444P12 IMGFMT_444P12_LE
+#define IMGFMT_444P10 IMGFMT_444P10_LE
+#define IMGFMT_444P9  IMGFMT_444P9_LE
+#define IMGFMT_422P16 IMGFMT_422P16_LE
+#define IMGFMT_422P14 IMGFMT_422P14_LE
+#define IMGFMT_422P12 IMGFMT_422P12_LE
+#define IMGFMT_422P10 IMGFMT_422P10_LE
+#define IMGFMT_422P9  IMGFMT_422P9_LE
+#define IMGFMT_420P16 IMGFMT_420P16_LE
+#define IMGFMT_420P14 IMGFMT_420P14_LE
+#define IMGFMT_420P12 IMGFMT_420P12_LE
+#define IMGFMT_420P10 IMGFMT_420P10_LE
+#define IMGFMT_420P9  IMGFMT_420P9_LE
+#define IMGFMT_Y16    IMGFMT_Y16_LE
+#define IMGFMT_IS_YUVP16_NE(fmt) IMGFMT_IS_YUVP16_LE(fmt)
+#endif
+
+#define IMGFMT_IS_YUVP16_LE(fmt) ((((fmt) - 0x51000034) & 0xfc0000ff) == 0) // depth tag 0x51..0x54 in the top byte, 0x34 in the low byte
+#define IMGFMT_IS_YUVP16_BE(fmt) ((((fmt) - 0x34000051) & 0xff0000fc) == 0) // byte-swapped: depth tag 0x51..0x54 in the low byte
+#define IMGFMT_IS_YUVP16(fmt)    (IMGFMT_IS_YUVP16_LE(fmt) || IMGFMT_IS_YUVP16_BE(fmt)) // NOTE(review): the 12-bit IDs (tag 0x55) match neither test - confirm this is intended
+
+/**
+ * \brief Map a format matched by IMGFMT_IS_YUVP16() to its full 16 bit sibling, e.g. IMGFMT_420P10_LE -> IMGFMT_420P16_LE
+ * \return the 16 bit format ID, or 0 when fmt matches neither IMGFMT_IS_YUVP16_LE nor IMGFMT_IS_YUVP16_BE.
+ */
+static inline int normalize_yuvp16(int fmt) {
+    const int le_match = IMGFMT_IS_YUVP16_LE(fmt);
+    const int be_match = IMGFMT_IS_YUVP16_BE(fmt);
+    if (le_match) return (fmt & 0x00ffffff) | 0x51000000; // LE: rewrite the depth tag in the top byte
+    if (be_match) return (fmt & 0xffffff00) | 0x00000051; // BE: rewrite the depth tag in the low byte
+    return 0;
+}
+
+/* Packed YUV Formats */
+
+#define IMGFMT_IUYV 0x56595549 // Interlaced UYVY
+#define IMGFMT_IY41 0x31435949 // Interlaced Y41P
+#define IMGFMT_IYU1 0x31555949
+#define IMGFMT_IYU2 0x32555949
+#define IMGFMT_UYVY 0x59565955
+#define IMGFMT_UYNV 0x564E5955 // Exactly same as UYVY
+#define IMGFMT_cyuv 0x76757963 // upside-down UYVY
+#define IMGFMT_Y422 0x32323459 // Exactly same as UYVY
+#define IMGFMT_YUY2 0x32595559
+#define IMGFMT_YUNV 0x564E5559 // Exactly same as YUY2
+#define IMGFMT_YVYU 0x55595659
+#define IMGFMT_Y41P 0x50313459
+#define IMGFMT_Y211 0x31313259
+#define IMGFMT_Y41T 0x54313459 // Y41P, Y lsb = transparency
+#define IMGFMT_Y42T 0x54323459 // UYVY, Y lsb = transparency
+#define IMGFMT_V422 0x32323456 // upside-down UYVY?
+#define IMGFMT_V655 0x35353656
+#define IMGFMT_CLJR 0x524A4C43
+#define IMGFMT_YUVP 0x50565559 // 10-bit YUYV
+#define IMGFMT_UYVP 0x50565955 // 10-bit UYVY
+
+/* Compressed Formats */
+#define IMGFMT_MPEGPES (('M'<<24)|('P'<<16)|('E'<<8)|('S'))
+#define IMGFMT_MJPEG (('M')|('J'<<8)|('P'<<16)|('G'<<24))
+/* Formats that are understood by zoran chips, we include
+ * non-interlaced, interlaced top-first, interlaced bottom-first */
+#define IMGFMT_ZRMJPEGNI  (('Z'<<24)|('R'<<16)|('N'<<8)|('I'))
+#define IMGFMT_ZRMJPEGIT (('Z'<<24)|('R'<<16)|('I'<<8)|('T'))
+#define IMGFMT_ZRMJPEGIB (('Z'<<24)|('R'<<16)|('I'<<8)|('B'))
+
+// I think that this code could not be used by any other codec/format
+#define IMGFMT_XVMC 0x1DC70000
+#define IMGFMT_XVMC_MASK 0xFFFF0000
+#define IMGFMT_IS_XVMC(fmt) (((fmt)&IMGFMT_XVMC_MASK)==IMGFMT_XVMC)
+//these are chroma420
+#define IMGFMT_XVMC_MOCO_MPEG2 (IMGFMT_XVMC|0x02)
+#define IMGFMT_XVMC_IDCT_MPEG2 (IMGFMT_XVMC|0x82)
+
+// VDPAU specific format.
+#define IMGFMT_VDPAU               0x1DC80000
+#define IMGFMT_VDPAU_MASK          0xFFFF0000
+#define IMGFMT_IS_VDPAU(fmt)       (((fmt)&IMGFMT_VDPAU_MASK)==IMGFMT_VDPAU)
+#define IMGFMT_VDPAU_MPEG1         (IMGFMT_VDPAU|0x01)
+#define IMGFMT_VDPAU_MPEG2         (IMGFMT_VDPAU|0x02)
+#define IMGFMT_VDPAU_H264          (IMGFMT_VDPAU|0x03)
+#define IMGFMT_VDPAU_WMV3          (IMGFMT_VDPAU|0x04)
+#define IMGFMT_VDPAU_VC1           (IMGFMT_VDPAU|0x05)
+#define IMGFMT_VDPAU_MPEG4         (IMGFMT_VDPAU|0x06)
+
+#define IMGFMT_IS_HWACCEL(fmt) (IMGFMT_IS_VDPAU(fmt) || IMGFMT_IS_XVMC(fmt))
+
+typedef struct {   // NOTE(review): name suggests the packet descriptor used with IMGFMT_MPEGPES - confirm
+    void* data;
+    int size;      // NOTE(review): presumably the length of data in bytes - confirm
+    int id;        // stream id. usually 0x1E0
+    int timestamp; // pts, 90000 Hz counter based
+} vo_mpegpes_t;
+
+const char *ff_vo_format_name(int format);
+
+/**
+ * Calculates the scale shifts for the chroma planes for planar YUV
+ *
+ * \param component_bits bits per component
+ * \return bits-per-pixel for format if successful (i.e. format is 3 or 4-planes planar YUV), 0 otherwise
+ */
+int ff_mp_get_chroma_shift(int format, int *x_shift, int *y_shift, int *component_bits);
+
+#endif /* MPLAYER_IMG_FORMAT_H */
diff --git a/libavfilter/libmpcodecs/libvo/fastmemcpy.h b/libavfilter/libmpcodecs/libvo/fastmemcpy.h
new file mode 100644
index 0000000000..5a17d0192a
--- /dev/null
+++ b/libavfilter/libmpcodecs/libvo/fastmemcpy.h
@@ -0,0 +1,99 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with MPlayer; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef MPLAYER_FASTMEMCPY_H
+#define MPLAYER_FASTMEMCPY_H
+
+#include <inttypes.h>
+#include <string.h>
+#include <stddef.h>
+
+void * fast_memcpy(void * to, const void * from, size_t len);
+void * mem2agpcpy(void * to, const void * from, size_t len);
+
+#if ! defined(CONFIG_FASTMEMCPY) || ! (HAVE_MMX || HAVE_MMX2 || HAVE_AMD3DNOW /* || HAVE_SSE || HAVE_SSE2 */)
+#define mem2agpcpy(a,b,c) memcpy(a,b,c)
+#define fast_memcpy(a,b,c) memcpy(a,b,c)
+#endif
+
+/**
+ * Copy height rows of bytesPerLine bytes each from src to dst with mem2agpcpy().
+ * Strides are byte distances between rows and may be negative; returns dst as passed in.
+ */
+static inline void * mem2agpcpy_pic(void * dst, const void * src, int bytesPerLine, int height, int dstStride, int srcStride)
+{
+    void *retval = dst;
+    int i;
+    if (height <= 0) // nothing to copy; also keeps the block length below well-defined
+        return retval;
+    if (dstStride == srcStride) {
+        if (srcStride < 0) { // start from the row with the lowest address
+            src = (const uint8_t*)src + (height-1)*srcStride;
+            dst = (uint8_t*)dst + (height-1)*dstStride;
+            srcStride = -srcStride;
+        }
+        // one block copy for all rows, ending with the last row's payload, not its padding
+        mem2agpcpy(dst, src, (size_t)srcStride*(height-1) + bytesPerLine);
+    } else {
+        for (i = 0; i < height; i++) {
+            mem2agpcpy(dst, src, bytesPerLine);
+            src = (const uint8_t*)src + srcStride;
+            dst = (uint8_t*)dst + dstStride;
+        }
+    }
+    return retval;
+}
+
+#define memcpy_pic(d, s, b, h, ds, ss) memcpy_pic2(d, s, b, h, ds, ss, 0)
+#define my_memcpy_pic(d, s, b, h, ds, ss) memcpy_pic2(d, s, b, h, ds, ss, 1)
+
+/**
+ * Copy height rows of bytesPerLine bytes each from src to dst with fast_memcpy().
+ * Strides are byte distances between rows and may be negative.
+ * \param limit2width always skip data between end of line and start of next
+ *                    instead of copying the full block when strides are the same
+ * \return the dst pointer as passed in
+ */
+static inline void * memcpy_pic2(void * dst, const void * src,
+                                 int bytesPerLine, int height,
+                                 int dstStride, int srcStride, int limit2width)
+{
+    void *retval = dst;
+    int i;
+
+    if (height <= 0) // nothing to copy; also keeps the block length below well-defined
+        return retval;
+    if (!limit2width && dstStride == srcStride) {
+        if (srcStride < 0) { // start from the row with the lowest address
+            src = (const uint8_t*)src + (height-1)*srcStride;
+            dst = (uint8_t*)dst + (height-1)*dstStride;
+            srcStride = -srcStride;
+        }
+        // one block copy for all rows, ending with the last row's payload, not its padding
+        fast_memcpy(dst, src, (size_t)srcStride*(height-1) + bytesPerLine);
+    } else {
+        for (i = 0; i < height; i++) {
+            fast_memcpy(dst, src, bytesPerLine);
+            src = (const uint8_t*)src + srcStride;
+            dst = (uint8_t*)dst + dstStride;
+        }
+    }
+    return retval;
+}
+
+#endif /* MPLAYER_FASTMEMCPY_H */
diff --git a/libavfilter/libmpcodecs/libvo/video_out.h b/libavfilter/libmpcodecs/libvo/video_out.h
new file mode 100644
index 0000000000..49d30987ff
--- /dev/null
+++ b/libavfilter/libmpcodecs/libvo/video_out.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) Aaron Holtzman - Aug 1999
+ * Strongly modified, most parts rewritten: A'rpi/ESP-team - 2000-2001
+ * (C) MPlayer developers
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_VIDEO_OUT_H
+#define MPLAYER_VIDEO_OUT_H
+
+#include <inttypes.h>
+#include <stdarg.h>
+
+//#include "sub/font_load.h"
+#include "../img_format.h"
+//#include "vidix/vidix.h"
+
+
+#define ROTATE(t, x, y) do { /* (x, y) becomes (y, -x); t is the type of the temporary */ \
+  t rot_tmp = x; \
+  x = y; \
+  y = -rot_tmp; \
+} while(0)
+
+#define VO_EVENT_EXPOSE 1
+#define VO_EVENT_RESIZE 2
+#define VO_EVENT_KEYPRESS 4
+#define VO_EVENT_REINIT 8
+#define VO_EVENT_MOVE 16
+#define VO_EVENT_MOUSE 32
+
+/* Obsolete: VOCTRL_QUERY_VAA 1 */
+/* does the device support the required format */
+#define VOCTRL_QUERY_FORMAT 2
+/* signal a device reset seek */
+#define VOCTRL_RESET 3
+/* true if vo driver can use GUI created windows */
+#define VOCTRL_GUISUPPORT 4
+/* used to switch to fullscreen */
+#define VOCTRL_FULLSCREEN 5
+/* signal a device pause */
+#define VOCTRL_PAUSE 7
+/* start/resume playback */
+#define VOCTRL_RESUME 8
+/* libmpcodecs direct rendering: */
+#define VOCTRL_GET_IMAGE 9
+#define VOCTRL_DRAW_IMAGE 13
+#define VOCTRL_SET_SPU_PALETTE 14
+/* decoding ahead: */
+#define VOCTRL_GET_NUM_FRAMES 10
+#define VOCTRL_GET_FRAME_NUM  11
+#define VOCTRL_SET_FRAME_NUM  12
+#define VOCTRL_GET_PANSCAN 15
+#define VOCTRL_SET_PANSCAN 16
+/* equalizer controls */
+#define VOCTRL_SET_EQUALIZER 17
+#define VOCTRL_GET_EQUALIZER 18
+/* Frame duplication */
+#define VOCTRL_DUPLICATE_FRAME 20
+// ... 21
+#define VOCTRL_START_SLICE 21
+
+#define VOCTRL_ONTOP 25
+#define VOCTRL_ROOTWIN 26
+#define VOCTRL_BORDER 27
+#define VOCTRL_DRAW_EOSD 28
+#define VOCTRL_GET_EOSD_RES 29
+
+#define VOCTRL_SET_DEINTERLACE 30
+#define VOCTRL_GET_DEINTERLACE 31
+
+#define VOCTRL_UPDATE_SCREENINFO 32
+
+// Vo can be used by xover
+#define VOCTRL_XOVERLAY_SUPPORT 22
+
+#define VOCTRL_XOVERLAY_SET_COLORKEY 24
+typedef struct {
+  uint32_t x11; // The raw x11 color
+  uint16_t r,g,b;
+} mp_colorkey_t;
+
+#define VOCTRL_XOVERLAY_SET_WIN 23
+typedef struct {
+  int x,y;
+  int w,h;
+} mp_win_t;
+
+#define VO_TRUE      1
+#define VO_FALSE     0
+#define VO_ERROR    -1
+#define VO_NOTAVAIL -2
+#define VO_NOTIMPL  -3
+
+#define VOFLAG_FULLSCREEN         0x01
+#define VOFLAG_MODESWITCHING      0x02
+#define VOFLAG_SWSCALE            0x04
+#define VOFLAG_FLIPPING           0x08
+#define VOFLAG_HIDDEN             0x10  //< Use to create a hidden window
+#define VOFLAG_STEREO             0x20  //< Use to create a stereo-capable window
+#define VOFLAG_DEPTH              0x40  //< Request a depth buffer
+#define VOFLAG_XOVERLAY_SUB_VO 0x10000
+
+typedef struct vo_info_s
+{
+    /* driver name ("Matrox Millennium G200/G400") */
+    const char *name;
+    /* short name (for config strings) ("mga") */
+    const char *short_name;
+    /* author ("Aaron Holtzman <aholtzma@ess.engr.uvic.ca>") */
+    const char *author;
+    /* any additional comments */
+    const char *comment;
+} vo_info_t;
+
+typedef struct vo_functions_s
+{
+    const vo_info_t *info;
+    /*
+     * Preinitializes driver (real INITIALIZATION)
+     *   arg - currently it's vo_subdevice
+     *   returns: zero on successful initialization, non-zero on error.
+     */
+    int (*preinit)(const char *arg);
+    /*
+     * Initialize (means CONFIGURE) the display driver.
+     * params:
+     *   width,height: image source size
+     *   d_width,d_height: size of the requested window size, just a hint
+     *   fullscreen: flag, 0=windowed 1=fullscreen, just a hint
+     *   title: window title, if available
+     *   format: fourcc of pixel format
+     * returns : zero on successful initialization, non-zero on error.
+     */
+    int (*config)(uint32_t width, uint32_t height, uint32_t d_width,
+                  uint32_t d_height, uint32_t fullscreen, char *title,
+                  uint32_t format);
+
+    /*
+     * Control interface
+     */
+    int (*control)(uint32_t request, void *data, ...);
+
+    /*
+     * Display a new RGB/BGR frame of the video to the screen.
+     * params:
+     *   src[0] - pointer to the image
+     */
+    int (*draw_frame)(uint8_t *src[]);
+
+    /*
+     * Draw a planar YUV slice to the buffer:
+     * params:
+     *   src[3] = source image planes (Y,U,V)
+     *   stride[3] = source image planes line widths (in bytes)
+     *   w,h = width*height of area to be copied (in Y pixels)
+     *   x,y = position at the destination image (in Y pixels)
+     */
+    int (*draw_slice)(uint8_t *src[], int stride[], int w,int h, int x,int y);
+
+    /*
+     * Draws OSD to the screen buffer
+     */
+    void (*draw_osd)(void);
+
+    /*
+     * Blit/Flip buffer to the screen. Must be called after each frame!
+     */
+    void (*flip_page)(void);
+
+    /*
+     * This func is called after every frame to handle keyboard and
+     * other events. It's called in PAUSE mode too!
+     */
+    void (*check_events)(void);
+
+    /*
+     * Closes driver. Should restore the original state of the system.
+     */
+    void (*uninit)(void);
+} vo_functions_t;
+
+const vo_functions_t* init_best_video_out(char** vo_list);
+int config_video_out(const vo_functions_t *vo, uint32_t width, uint32_t height,
+                     uint32_t d_width, uint32_t d_height, uint32_t flags,
+                     char *title, uint32_t format);
+void list_video_out(void);
+
+// NULL terminated array of all drivers
+extern const vo_functions_t* const video_out_drivers[];
+
+extern int vo_flags;
+
+extern int vo_config_count;
+
+extern int xinerama_screen;
+extern int xinerama_x;
+extern int xinerama_y;
+
+// correct resolution/bpp on screen:  (should be autodetected by vo_init())
+extern int vo_depthonscreen;
+extern int vo_screenwidth;
+extern int vo_screenheight;
+
+// requested resolution/bpp:  (-x -y -bpp options)
+extern int vo_dx;
+extern int vo_dy;
+extern int vo_dwidth;
+extern int vo_dheight;
+extern int vo_dbpp;
+
+extern int vo_grabpointer;
+extern int vo_doublebuffering;
+extern int vo_directrendering;
+extern int vo_vsync;
+extern int vo_fsmode;
+extern float vo_panscan;
+extern float vo_border_pos_x;
+extern float vo_border_pos_y;
+extern int vo_rotate;
+extern int vo_adapter_num;
+extern int vo_refresh_rate;
+extern int vo_keepaspect;
+extern int vo_rootwin;
+extern int vo_ontop;
+extern int vo_border;
+
+extern int vo_gamma_gamma;
+extern int vo_gamma_brightness;
+extern int vo_gamma_saturation;
+extern int vo_gamma_contrast;
+extern int vo_gamma_hue;
+extern int vo_gamma_red_intensity;
+extern int vo_gamma_green_intensity;
+extern int vo_gamma_blue_intensity;
+
+extern int vo_nomouse_input;
+extern int enable_mouse_movements;
+
+extern int vo_pts;
+extern float vo_fps;
+
+extern char *vo_subdevice;
+
+extern int vo_colorkey;
+
+extern char *vo_winname;
+extern char *vo_wintitle;
+
+extern int64_t WinID;
+
+typedef struct {
+        float min;
+        float max;
+        } range_t;
+
+float range_max(range_t *r);
+int in_range(range_t *r, float f);
+range_t *str2range(char *s);
+extern char *monitor_hfreq_str;
+extern char *monitor_vfreq_str;
+extern char *monitor_dotclock_str;
+
+struct mp_keymap {
+  int from;
+  int to;
+};
+int lookup_keymap_table(const struct mp_keymap *map, int key);
+struct vo_rect {
+  int left, right, top, bottom, width, height;
+};
+void calc_src_dst_rects(int src_width, int src_height, struct vo_rect *src, struct vo_rect *dst,
+                        struct vo_rect *borders, const struct vo_rect *crop);
+void vo_mouse_movement(int posx, int posy);
+
+/* Map pos to an offset: within [0,1] it scales the free space full-part, outside it counts in units of part. */
+static inline int apply_border_pos(int full, int part, float pos) {
+  const int spare = full - part;
+  if (pos < 0)
+    return pos * part;
+  if (pos <= 1.0)
+    return pos * spare;
+  return spare + (pos - 1) * part;
+}
+
+#endif /* MPLAYER_VIDEO_OUT_H */
diff --git a/libavfilter/libmpcodecs/mp_image.c b/libavfilter/libmpcodecs/mp_image.c
new file mode 100644
index 0000000000..0e4d6d7591
--- /dev/null
+++ b/libavfilter/libmpcodecs/mp_image.c
@@ -0,0 +1,257 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "img_format.h"
+#include "mp_image.h"
+
+#include "libvo/fastmemcpy.h"
+//#include "libavutil/mem.h"
+#include "libavutil/imgutils.h"
+
+/* Allocate the pixel buffer of mpi (plus a 1024-byte palette for packed palettized images) and fill
+ * planes[]/stride[]. MP_IMGFLAG_ALLOCATED is set only on success; on failure planes[0] is left NULL. */
+void ff_mp_image_alloc_planes(mp_image_t *mpi) {
+    uint32_t temp[256] = {0}; /* zeroed: a caller-requested palette must never be filled from stack garbage */
+    if (avpriv_set_systematic_pal2(temp, ff_mp2ff_pix_fmt(mpi->imgfmt)) >= 0)
+        mpi->flags |= MP_IMGFLAG_RGB_PALETTE;
+  // IF09 - allocate space for 4. plane delta info - unused
+  mpi->planes[0]=av_malloc(mpi->bpp*mpi->width*(mpi->height+2)/8+
+                 (mpi->imgfmt == IMGFMT_IF09 ? mpi->chroma_width*mpi->chroma_height : 0));
+  if (!mpi->planes[0]) return; // out of memory: do not derive plane pointers from NULL
+  if (mpi->flags&MP_IMGFLAG_PLANAR) {
+    int bpp = IMGFMT_IS_YUVP16(mpi->imgfmt)? 2 : 1;
+    // YV12/I420/YVU9/IF09. feel free to add other planar formats here...
+    mpi->stride[0]=mpi->stride[3]=bpp*mpi->width;
+    if(mpi->num_planes > 2){
+      mpi->stride[1]=mpi->stride[2]=bpp*mpi->chroma_width;
+      if(mpi->flags&MP_IMGFLAG_SWAPPED){
+        // I420/IYUV  (Y,U,V)
+        mpi->planes[1]=mpi->planes[0]+mpi->stride[0]*mpi->height;
+        mpi->planes[2]=mpi->planes[1]+mpi->stride[1]*mpi->chroma_height;
+        if (mpi->num_planes > 3)
+            mpi->planes[3]=mpi->planes[2]+mpi->stride[2]*mpi->chroma_height;
+      } else {
+        // YV12,YVU9,IF09  (Y,V,U)
+        mpi->planes[2]=mpi->planes[0]+mpi->stride[0]*mpi->height;
+        mpi->planes[1]=mpi->planes[2]+mpi->stride[1]*mpi->chroma_height;
+        if (mpi->num_planes > 3)
+            mpi->planes[3]=mpi->planes[1]+mpi->stride[1]*mpi->chroma_height;
+      }
+    } else {
+      // NV12/NV21
+      mpi->stride[1]=mpi->chroma_width;
+      mpi->planes[1]=mpi->planes[0]+mpi->stride[0]*mpi->height;
+    }
+  } else {
+    mpi->stride[0]=mpi->width*mpi->bpp/8;
+    if (mpi->flags & MP_IMGFLAG_RGB_PALETTE) {
+      mpi->planes[1] = av_malloc(1024);
+      if (!mpi->planes[1]) { av_free(mpi->planes[0]); mpi->planes[0] = NULL; return; } // undo, stay unallocated
+      memcpy(mpi->planes[1], temp, 1024);
+    }
+  }
+  mpi->flags|=MP_IMGFLAG_ALLOCATED;
+}
+
+/* Create a w x h image in format fmt and allocate its planes; returns NULL if the descriptor cannot be allocated. */
+mp_image_t* ff_alloc_mpi(int w, int h, unsigned long int fmt) {
+  mp_image_t* mpi = ff_new_mp_image(w,h);
+  if (!mpi) return NULL; /* ff_new_mp_image() failed: nothing to set up */
+  ff_mp_image_setfmt(mpi,fmt);
+  ff_mp_image_alloc_planes(mpi);
+  return mpi;
+}
+
+/* Copy the visible picture of mpi into dmpi: planes 0-2 for planar images, plane 0 for packed ones. */
+void ff_copy_mpi(mp_image_t *dmpi, mp_image_t *mpi) {
+  if(mpi->flags&MP_IMGFLAG_PLANAR){
+    /* 16-bit planar samples take two bytes each, as in the strides set by ff_mp_image_alloc_planes() */
+    const int bps = IMGFMT_IS_YUVP16(mpi->imgfmt) ? 2 : 1;
+    memcpy_pic(dmpi->planes[0], mpi->planes[0], bps*mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
+    memcpy_pic(dmpi->planes[1], mpi->planes[1], bps*mpi->chroma_width, mpi->chroma_height,
+               dmpi->stride[1], mpi->stride[1]);
+    memcpy_pic(dmpi->planes[2], mpi->planes[2], bps*mpi->chroma_width, mpi->chroma_height,
+               dmpi->stride[2], mpi->stride[2]);
+  } else {
+    /* round the row size up so sub-byte formats (bpp < 8) do not end up copying zero bytes per line */
+    memcpy_pic(dmpi->planes[0], mpi->planes[0], (mpi->w*dmpi->bpp + 7)/8, mpi->h, dmpi->stride[0], mpi->stride[0]);
+  }
+}
+
+void ff_mp_image_setfmt(mp_image_t* mpi,unsigned int out_fmt){
+    mpi->flags&=~(MP_IMGFLAG_PLANAR|MP_IMGFLAG_YUV|MP_IMGFLAG_SWAPPED); // start from clean format flags
+    mpi->imgfmt=out_fmt;
+    // compressed formats: bpp is 0 and num_planes is left as it was
+    if(out_fmt == IMGFMT_MPEGPES ||
+       out_fmt == IMGFMT_ZRMJPEGNI || out_fmt == IMGFMT_ZRMJPEGIT || out_fmt == IMGFMT_ZRMJPEGIB ||
+       IMGFMT_IS_HWACCEL(out_fmt)){
+        mpi->bpp=0;
+        return;
+    }
+    mpi->num_planes=1; // packed RGB/BGR/XYZ below use a single plane
+    if (IMGFMT_IS_RGB(out_fmt)) {
+        if (IMGFMT_RGB_DEPTH(out_fmt) < 8 && !(out_fmt&128))
+            mpi->bpp = IMGFMT_RGB_DEPTH(out_fmt);
+        else
+            mpi->bpp=(IMGFMT_RGB_DEPTH(out_fmt)+7)&(~7); // round depth up to a multiple of 8
+        return;
+    }
+    if (IMGFMT_IS_BGR(out_fmt)) {
+        if (IMGFMT_BGR_DEPTH(out_fmt) < 8 && !(out_fmt&128))
+            mpi->bpp = IMGFMT_BGR_DEPTH(out_fmt);
+        else
+            mpi->bpp=(IMGFMT_BGR_DEPTH(out_fmt)+7)&(~7); // round depth up to a multiple of 8
+        mpi->flags|=MP_IMGFLAG_SWAPPED;
+        return;
+    }
+    if (IMGFMT_IS_XYZ(out_fmt)) {
+        mpi->bpp=3*((IMGFMT_XYZ_DEPTH(out_fmt) + 7) & ~7);
+        return;
+    }
+    mpi->num_planes=3; // default for everything below; the switch adjusts it where needed
+    if (out_fmt == IMGFMT_GBR24P) {
+        mpi->bpp=24;
+        mpi->flags|=MP_IMGFLAG_PLANAR;
+        return;
+    } else if (out_fmt == IMGFMT_GBR12P) {
+        mpi->bpp=36;
+        mpi->flags|=MP_IMGFLAG_PLANAR;
+        return;
+    } else if (out_fmt == IMGFMT_GBR14P) {
+        mpi->bpp=42;
+        mpi->flags|=MP_IMGFLAG_PLANAR;
+        return;
+    }
+    mpi->flags|=MP_IMGFLAG_YUV; // whatever is left is treated as YUV
+    if (ff_mp_get_chroma_shift(out_fmt, NULL, NULL, NULL)) {
+        // non-zero result: planar format; the second call stores the shifts and its result becomes bpp
+        mpi->flags|=MP_IMGFLAG_PLANAR;
+        mpi->bpp = ff_mp_get_chroma_shift(out_fmt, &mpi->chroma_x_shift, &mpi->chroma_y_shift, NULL);
+        mpi->chroma_width  = mpi->width  >> mpi->chroma_x_shift;
+        mpi->chroma_height = mpi->height >> mpi->chroma_y_shift;
+    }
+    switch(out_fmt){
+    case IMGFMT_I420:
+    case IMGFMT_IYUV:
+        mpi->flags|=MP_IMGFLAG_SWAPPED; /* fallthrough */
+    case IMGFMT_YV12:
+        return;
+    case IMGFMT_420A:
+    case IMGFMT_422A:
+    case IMGFMT_444A:
+    case IMGFMT_IF09:
+        mpi->num_planes=4; /* fallthrough */
+    case IMGFMT_YVU9:
+    case IMGFMT_444P:
+    case IMGFMT_422P:
+    case IMGFMT_411P:
+    case IMGFMT_440P:
+    case IMGFMT_444P16_LE:
+    case IMGFMT_444P16_BE:
+    case IMGFMT_444P14_LE:
+    case IMGFMT_444P14_BE:
+    case IMGFMT_444P12_LE:
+    case IMGFMT_444P12_BE:
+    case IMGFMT_444P10_LE:
+    case IMGFMT_444P10_BE:
+    case IMGFMT_444P9_LE:
+    case IMGFMT_444P9_BE:
+    case IMGFMT_422P16_LE:
+    case IMGFMT_422P16_BE:
+    case IMGFMT_422P14_LE:
+    case IMGFMT_422P14_BE:
+    case IMGFMT_422P12_LE:
+    case IMGFMT_422P12_BE:
+    case IMGFMT_422P10_LE:
+    case IMGFMT_422P10_BE:
+    case IMGFMT_422P9_LE:
+    case IMGFMT_422P9_BE:
+    case IMGFMT_420P16_LE:
+    case IMGFMT_420P16_BE:
+    case IMGFMT_420P14_LE:
+    case IMGFMT_420P14_BE:
+    case IMGFMT_420P12_LE:
+    case IMGFMT_420P12_BE:
+    case IMGFMT_420P10_LE:
+    case IMGFMT_420P10_BE:
+    case IMGFMT_420P9_LE:
+    case IMGFMT_420P9_BE:
+        return;
+    case IMGFMT_Y16_LE:
+    case IMGFMT_Y16_BE:
+        mpi->bpp=16; /* fallthrough */
+    case IMGFMT_Y800:
+    case IMGFMT_Y8:
+        /* they're planar ones, but for easier handling use them as packed */
+        mpi->flags&=~MP_IMGFLAG_PLANAR;
+        mpi->num_planes=1;
+        return;
+    case IMGFMT_Y8A:
+        mpi->num_planes=2;
+        return;
+    case IMGFMT_UYVY:
+        mpi->flags|=MP_IMGFLAG_SWAPPED; /* fallthrough */
+    case IMGFMT_YUY2:
+        mpi->chroma_x_shift = 1;
+        mpi->bpp=16;
+        mpi->num_planes=1;
+        return;
+    case IMGFMT_NV12:
+        mpi->flags|=MP_IMGFLAG_SWAPPED; /* fallthrough */
+    case IMGFMT_NV21:
+        mpi->flags|=MP_IMGFLAG_PLANAR;
+        mpi->bpp=12;
+        mpi->num_planes=2;
+        mpi->chroma_width=(mpi->width>>0);
+        mpi->chroma_height=(mpi->height>>1);
+        mpi->chroma_x_shift=0;
+        mpi->chroma_y_shift=1;
+        return;
+    }
+    ff_mp_msg(MSGT_DECVIDEO,MSGL_WARN,"mp_image: unknown out_fmt: 0x%X\n",out_fmt); // no case matched
+    mpi->bpp=0;
+}
+
+mp_image_t* ff_new_mp_image(int w,int h){ /* zero-filled descriptor, no pixel buffers; NULL on failure */
+    mp_image_t *img = malloc(sizeof(*img));
+    if (img == NULL)
+        return NULL;
+    memset(img, 0, sizeof(*img));
+    img->w = img->width  = w; /* visible and stored sizes start out equal */
+    img->h = img->height = h;
+    return img; }
+
+/* Release mpi together with the buffers ff_mp_image_alloc_planes() attached to it; NULL is a no-op. */
+void ff_free_mp_image(mp_image_t* mpi){
+    if(!mpi) return;
+    if(mpi->flags&MP_IMGFLAG_ALLOCATED){
+        av_free(mpi->planes[0]); /* because we allocate the whole image at once */
+        if ((mpi->flags & MP_IMGFLAG_RGB_PALETTE) && !(mpi->flags & MP_IMGFLAG_PLANAR))
+            av_free(mpi->planes[1]); /* separate palette buffer; planar planes[1] lies inside planes[0] */
+    }
+    free(mpi);
+}
+
diff --git a/libavfilter/libmpcodecs/mp_image.h b/libavfilter/libmpcodecs/mp_image.h
new file mode 100644
index 0000000000..aedf4510cd
--- /dev/null
+++ b/libavfilter/libmpcodecs/mp_image.h
@@ -0,0 +1,159 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_MP_IMAGE_H
+#define MPLAYER_MP_IMAGE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#undef printf //FIXME
+#undef fprintf //FIXME
+#include "mp_msg.h"
+#include "libavutil/avutil.h"
+#include "libavutil/avassert.h"
+#undef realloc
+#undef malloc
+#undef free
+#undef rand
+#undef srand
+#undef printf
+#undef strncpy
+#define ASMALIGN(ZEROBITS) ".p2align " #ZEROBITS "\n\t"
+#define CODEC_FLAG2_MEMC_ONLY     0x00001000 ///< Only do ME/MC (I frames -> ref, P frame -> ME+MC).
+
+enum AVPixelFormat ff_mp2ff_pix_fmt(int mp);
+
+//--------- codec's requirements (filled by the codec/vf) ---------
+
+//--- buffer content restrictions:
+// set if buffer content shouldn't be modified:
+#define MP_IMGFLAG_PRESERVE 0x01
+// set if buffer content will be READ.
+// This can be e.g. for next frame's MC: (I/P mpeg frames) -
+// then in combination with MP_IMGFLAG_PRESERVE - or it
+// can be because a video filter or codec will read a significant
+// amount of data while processing that frame (e.g. blending something
+// onto the frame, MV based intra prediction).
+// A frame marked like this should not be placed into uncacheable
+// video RAM for example.
+#define MP_IMGFLAG_READABLE 0x02
+
+//--- buffer width/stride/plane restrictions: (used for direct rendering)
+// stride _have_to_ be aligned to MB boundary:  [for DR restrictions]
+#define MP_IMGFLAG_ACCEPT_ALIGNED_STRIDE 0x4
+// stride should be aligned to MB boundary:     [for buffer allocation]
+#define MP_IMGFLAG_PREFER_ALIGNED_STRIDE 0x8
+// codec accept any stride (>=width):
+#define MP_IMGFLAG_ACCEPT_STRIDE 0x10
+// codec accept any width (width*bpp=stride -> stride%bpp==0) (>=width):
+#define MP_IMGFLAG_ACCEPT_WIDTH 0x20
+//--- for planar formats only:
+// uses only stride[0], and stride[1]=stride[2]=stride[0]>>mpi->chroma_x_shift
+#define MP_IMGFLAG_COMMON_STRIDE 0x40
+// uses only planes[0], and calculates planes[1,2] from width,height,imgfmt
+#define MP_IMGFLAG_COMMON_PLANE 0x80
+
+#define MP_IMGFLAGMASK_RESTRICTIONS 0xFF
+
+//--------- color info (filled by ff_mp_image_setfmt() ) -----------
+// set if number of planes > 1
+#define MP_IMGFLAG_PLANAR 0x100
+// set if it's YUV colorspace
+#define MP_IMGFLAG_YUV 0x200
+// set if it's swapped (BGR or YVU) plane/byteorder
+#define MP_IMGFLAG_SWAPPED 0x400
+// set if you want memory for palette allocated and managed by ff_vf_get_image etc.
+#define MP_IMGFLAG_RGB_PALETTE 0x800
+
+#define MP_IMGFLAGMASK_COLORS 0xF00
+
+// codec uses drawing/rendering callbacks (draw_slice()-like thing, DR method 2)
+// [the codec will set this flag if it supports callbacks, and the vo _may_
+//  clear it in get_image() if draw_slice() not implemented]
+#define MP_IMGFLAG_DRAW_CALLBACK 0x1000
+// set if it's in video buffer/memory: [set by vo/vf's get_image() !!!]
+#define MP_IMGFLAG_DIRECT 0x2000
+// set if buffer is allocated (used in destination images):
+#define MP_IMGFLAG_ALLOCATED 0x4000
+
+// buffer type was printed (do NOT set this flag - it's for INTERNAL USE!!!)
+#define MP_IMGFLAG_TYPE_DISPLAYED 0x8000
+
+// codec doesn't support any form of direct rendering - it has own buffer
+// allocation. so we just export its buffer pointers:
+#define MP_IMGTYPE_EXPORT 0
+// codec requires a static WO buffer, but it does only partial updates later:
+#define MP_IMGTYPE_STATIC 1
+// codec just needs some WO memory, where it writes/copies the whole frame to:
+#define MP_IMGTYPE_TEMP 2
+// I+P type, requires 2+ independent static R/W buffers
+#define MP_IMGTYPE_IP 3
+// I+P+B type, requires 2+ independent static R/W and 1+ temp WO buffers
+#define MP_IMGTYPE_IPB 4
+// Upper 16 bits give desired buffer number, -1 means get next available
+#define MP_IMGTYPE_NUMBERED 5
+// Doesn't need any buffer, incomplete image (probably a first field only)
+// we need this type to be able to differentiate between half frames and
+// all other cases
+#define MP_IMGTYPE_INCOMPLETE 6
+
+#define MP_MAX_PLANES 4
+
+#define MP_IMGFIELD_ORDERED 0x01
+#define MP_IMGFIELD_TOP_FIRST 0x02
+#define MP_IMGFIELD_REPEAT_FIRST 0x04
+#define MP_IMGFIELD_TOP 0x08
+#define MP_IMGFIELD_BOTTOM 0x10
+#define MP_IMGFIELD_INTERLACED 0x20
+
+typedef struct mp_image {
+    unsigned int flags; // MP_IMGFLAG_* bits
+    unsigned char type; // presumably one of MP_IMGTYPE_* - TODO confirm against callers
+    int number;
+    unsigned char bpp;  // bits/pixel. NOT depth! for RGB it will be n*8
+    unsigned int imgfmt; // format code stored by ff_mp_image_setfmt()
+    int width,height;  // stored dimensions
+    int x,y,w,h;  // visible dimensions
+    unsigned char* planes[MP_MAX_PLANES]; // per-plane data pointers
+    int stride[MP_MAX_PLANES]; // per-plane line size in bytes
+    char * qscale;
+    int qstride;
+    int pict_type; // 0->unknown, 1->I, 2->P, 3->B
+    int fields; // presumably MP_IMGFIELD_* bits - TODO confirm against callers
+    int qscale_type; // 0->mpeg1/4/h263, 1->mpeg2
+    int num_planes; // set by ff_mp_image_setfmt()
+    /* these are only used by planar formats Y,U(Cb),V(Cr) */
+    int chroma_width;
+    int chroma_height;
+    int chroma_x_shift; // horizontal
+    int chroma_y_shift; // vertical
+    int usage_count;
+    /* for private use by filter or vo driver (to store buffer id or dmpi) */
+    void* priv;
+} mp_image_t;
+
+void ff_mp_image_setfmt(mp_image_t* mpi,unsigned int out_fmt);
+mp_image_t* ff_new_mp_image(int w,int h);
+void ff_free_mp_image(mp_image_t* mpi);
+
+mp_image_t* ff_alloc_mpi(int w, int h, unsigned long int fmt);
+void ff_mp_image_alloc_planes(mp_image_t *mpi);
+void ff_copy_mpi(mp_image_t *dmpi, mp_image_t *mpi);
+
+#endif /* MPLAYER_MP_IMAGE_H */
diff --git a/libavfilter/libmpcodecs/mp_msg.h b/libavfilter/libmpcodecs/mp_msg.h
new file mode 100644
index 0000000000..51cdff3cef
--- /dev/null
+++ b/libavfilter/libmpcodecs/mp_msg.h
@@ -0,0 +1,166 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_MP_MSG_H
+#define MPLAYER_MP_MSG_H
+
+#include <stdarg.h>
+
+// defined in mplayer.c and mencoder.c
+extern int verbose;
+
+// verbosity level:
+
+/* Only messages level MSGL_FATAL-MSGL_STATUS should be translated,
+ * messages level MSGL_V and above should not be translated. */
+
+#define MSGL_FATAL 0  // will exit/abort
+#define MSGL_ERR 1    // continues
+#define MSGL_WARN 2   // only warning
+#define MSGL_HINT 3   // short help message
+#define MSGL_INFO 4   // -quiet
+#define MSGL_STATUS 5 // v=0
+#define MSGL_V 6      // v=1
+#define MSGL_DBG2 7   // v=2
+#define MSGL_DBG3 8   // v=3
+#define MSGL_DBG4 9   // v=4
+#define MSGL_DBG5 10  // v=5
+
+#define MSGL_FIXME 1  // for conversions from printf where the appropriate MSGL is not known; set equal to ERR for obtrusiveness
+#define MSGT_FIXME 0  // for conversions from printf where the appropriate MSGT is not known; set equal to GLOBAL for obtrusiveness
+
+// code/module:
+
+#define MSGT_GLOBAL 0        // common player stuff errors
+#define MSGT_CPLAYER 1       // console player (mplayer.c)
+#define MSGT_GPLAYER 2       // gui player
+
+#define MSGT_VO 3       // libvo
+#define MSGT_AO 4       // libao
+
+#define MSGT_DEMUXER 5    // demuxer.c (general stuff)
+#define MSGT_DS 6         // demux stream (add/read packet etc)
+#define MSGT_DEMUX 7      // fileformat-specific stuff (demux_*.c)
+#define MSGT_HEADER 8     // fileformat-specific header (*header.c)
+
+#define MSGT_AVSYNC 9     // mplayer.c timer stuff
+#define MSGT_AUTOQ 10     // mplayer.c auto-quality stuff
+
+#define MSGT_CFGPARSER 11 // cfgparser.c
+
+#define MSGT_DECAUDIO 12  // av decoder
+#define MSGT_DECVIDEO 13
+
+#define MSGT_SEEK 14    // seeking code
+#define MSGT_WIN32 15   // win32 dll stuff
+#define MSGT_OPEN 16    // open.c (stream opening)
+#define MSGT_DVD 17     // open.c (DVD init/read/seek)
+
+#define MSGT_PARSEES 18 // parse_es.c (mpeg stream parser)
+#define MSGT_LIRC 19    // lirc_mp.c and input lirc driver
+
+#define MSGT_STREAM 20  // stream.c
+#define MSGT_CACHE 21   // cache2.c
+
+#define MSGT_MENCODER 22
+
+#define MSGT_XACODEC 23 // XAnim codecs
+
+#define MSGT_TV 24      // TV input subsystem
+
+#define MSGT_OSDEP 25  // OS-dependent parts
+
+#define MSGT_SPUDEC 26 // spudec.c
+
+#define MSGT_PLAYTREE 27    // Playtree handeling (playtree.c, playtreeparser.c)
+
+#define MSGT_INPUT 28
+
+#define MSGT_VFILTER 29
+
+#define MSGT_OSD 30
+
+#define MSGT_NETWORK 31
+
+#define MSGT_CPUDETECT 32
+
+#define MSGT_CODECCFG 33
+
+#define MSGT_SWS 34
+
+#define MSGT_VOBSUB 35
+#define MSGT_SUBREADER 36
+
+#define MSGT_AFILTER 37  // Audio filter messages
+
+#define MSGT_NETST 38 // Netstream
+
+#define MSGT_MUXER 39 // muxer layer
+
+#define MSGT_OSD_MENU 40
+
+#define MSGT_IDENTIFY 41  // -identify output
+
+#define MSGT_RADIO 42
+
+#define MSGT_ASS 43 // libass messages
+
+#define MSGT_LOADER 44 // dll loader messages
+
+#define MSGT_STATUSLINE 45 // playback/encoding status line
+
+#define MSGT_TELETEXT 46       // Teletext decoder
+
+#define MSGT_MAX 64
+
+
+extern char *ff_mp_msg_charset;
+extern int ff_mp_msg_color;
+extern int ff_mp_msg_module;
+
+extern int ff_mp_msg_levels[MSGT_MAX];
+extern int ff_mp_msg_level_all;
+
+
+void ff_mp_msg_init(void);
+int ff_mp_msg_test(int mod, int lev);
+
+#include "config.h"
+
+void ff_mp_msg_va(int mod, int lev, const char *format, va_list va);
+#ifdef __GNUC__
+void ff_mp_msg(int mod, int lev, const char *format, ... ) __attribute__ ((format (printf, 3, 4)));
+#   ifdef MP_DEBUG
+#      define mp_dbg(mod,lev, args... ) ff_mp_msg(mod, lev, ## args )
+#   else
+       // only useful for developers, disable but check syntax
+#      define mp_dbg(mod,lev, args... ) do { if (0) ff_mp_msg(mod, lev, ## args ); } while (0)
+#   endif
+#else // not GNU C
+void ff_mp_msg(int mod, int lev, const char *format, ... );
+#   ifdef MP_DEBUG
+#      define mp_dbg(mod,lev, ... ) ff_mp_msg(mod, lev, __VA_ARGS__)
+#   else
+       // only useful for developers, disable but check syntax
+#      define mp_dbg(mod,lev, ... ) do { if (0) ff_mp_msg(mod, lev, __VA_ARGS__); } while (0)
+#   endif
+#endif /* __GNUC__ */
+
+const char* ff_filename_recode(const char* filename);
+
+#endif /* MPLAYER_MP_MSG_H */
diff --git a/libavfilter/libmpcodecs/mpc_info.h b/libavfilter/libmpcodecs/mpc_info.h
new file mode 100644
index 0000000000..8554699120
--- /dev/null
+++ b/libavfilter/libmpcodecs/mpc_info.h
@@ -0,0 +1,43 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_MPC_INFO_H
+#define MPLAYER_MPC_INFO_H
+
+typedef struct mp_codec_info_s
+{
+        /* codec long name ("Autodesk FLI/FLC Animation decoder") */
+        const char *name;
+        /* short name (same as driver name in codecs.conf) ("dshow") */
+        const char *short_name;
+        /* interface author/maintainer */
+        const char *maintainer;
+        /* codec author ("Aaron Holtzman <aholtzma@ess.engr.uvic.ca>") */
+        const char *author;
+        /* any additional comments */
+        const char *comment;
+} mp_codec_info_t;
+
+#define CONTROL_OK 1
+#define CONTROL_TRUE 1
+#define CONTROL_FALSE 0
+#define CONTROL_UNKNOWN -1
+#define CONTROL_ERROR -2
+#define CONTROL_NA -3
+
+#endif /* MPLAYER_MPC_INFO_H */
diff --git a/libavfilter/libmpcodecs/vf.h b/libavfilter/libmpcodecs/vf.h
new file mode 100644
index 0000000000..d8fc66be47
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf.h
@@ -0,0 +1,169 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_VF_H
+#define MPLAYER_VF_H
+
+//#include "m_option.h"
+#include "mp_image.h"
+
+//extern m_obj_settings_t* vf_settings;
+//extern const m_obj_list_t vf_obj_list;
+
+struct vf_instance;
+struct vf_priv_s;
+
+typedef struct vf_info_s { // static descriptor exported by each filter (e.g. ff_vf_info_eq)
+    const char *info;    // human-readable description, e.g. "soft video equalizer"
+    const char *name;    // short filter name, e.g. "eq"
+    const char *author;
+    const char *comment;
+    int (*vf_open)(struct vf_instance *vf,char* args); // sets up the instance; args is the raw option string and may be NULL
+    // Pointer to a struct describing the options (opaque in this header; may be NULL)
+    const void* opts;
+} vf_info_t;
+
+#define NUM_NUMBERED_MPI 50
+
+typedef struct vf_image_context_s { // per-instance image slots, grouped by kind (presumably one group per MP_IMGTYPE_* -- confirm in vf.c)
+    mp_image_t* static_images[2];
+    mp_image_t* temp_images[1];
+    mp_image_t* export_images[1];
+    mp_image_t* numbered_images[NUM_NUMBERED_MPI];
+    int static_idx; // presumably selects between the two static_images slots -- TODO confirm in vf.c
+} vf_image_context_t;
+
+typedef struct vf_format_context_t { // NOTE: the struct tag reuses the typedef name; the other structs in this header use an _s tag
+    int have_configured; // presumably non-zero once config() has run -- TODO confirm in vf.c
+    int orig_width, orig_height, orig_fmt; // presumably the values seen by the first config() -- TODO confirm
+} vf_format_context_t;
+
+typedef struct vf_instance { // one node of a filter chain; instances are linked through 'next'
+    const vf_info_t* info;
+    // funcs: callbacks; a filter's vf_open() installs the ones it implements
+    int (*config)(struct vf_instance *vf,
+        int width, int height, int d_width, int d_height,
+        unsigned int flags, unsigned int outfmt);
+    int (*control)(struct vf_instance *vf,
+        int request, void* data); // request is a VFCTRL_* code; the type behind data depends on the request
+    int (*query_format)(struct vf_instance *vf,
+        unsigned int fmt); // fmt is an IMGFMT_* value
+    void (*get_image)(struct vf_instance *vf,
+        mp_image_t *mpi);
+    int (*put_image)(struct vf_instance *vf,
+        mp_image_t *mpi, double pts); // receives one input frame together with its pts
+    void (*start_slice)(struct vf_instance *vf,
+        mp_image_t *mpi);
+    void (*draw_slice)(struct vf_instance *vf,
+        unsigned char** src, int* stride, int w,int h, int x, int y);
+    void (*uninit)(struct vf_instance *vf); // releases whatever vf_open()/put_image() allocated in priv
+
+    int (*continue_buffered_image)(struct vf_instance *vf);
+    // caps:
+    unsigned int default_caps; // used by default query_format()
+    unsigned int default_reqs; // used by default config()
+    // data:
+    int w, h;
+    vf_image_context_t imgctx;
+    vf_format_context_t fmt;
+    struct vf_instance *next; // following filter; filters pass vf->next to ff_vf_get_image()
+    mp_image_t *dmpi;
+    struct vf_priv_s* priv; // filter-private state; every filter source defines its own struct vf_priv_s
+} vf_instance_t;
+
+// control codes:
+#include "mpc_info.h"
+
+typedef struct vf_seteq_s /* payload of VFCTRL_SET_EQUALIZER / VFCTRL_GET_EQUALIZER */
+{
+    const char *item; /* property name, e.g. "brightness", "contrast", "gamma", "saturation" */
+    int value;        /* value to apply (SET) or filled in by the filter (GET) */
+} vf_equalizer_t;
+
+#define VFCTRL_QUERY_MAX_PP_LEVEL 4 /* test for postprocessing support (max level) */
+#define VFCTRL_SET_PP_LEVEL 5 /* set postprocessing level */
+#define VFCTRL_SET_EQUALIZER 6 /* set color options (brightness, contrast etc.) */
+#define VFCTRL_GET_EQUALIZER 8 /* get color options (brightness, contrast etc.); note: listed before code 7 */
+#define VFCTRL_DRAW_OSD 7
+#define VFCTRL_CHANGE_RECTANGLE 9 /* Change the rectangle boundaries */
+#define VFCTRL_FLIP_PAGE 10 /* Tell the vo to flip pages */
+#define VFCTRL_DUPLICATE_FRAME 11 /* For encoding - encode zero-change frame */
+#define VFCTRL_SKIP_NEXT_FRAME 12 /* For encoding - drop the next frame that passes through */
+#define VFCTRL_FLUSH_FRAMES    13 /* For encoding - flush delayed frames */
+#define VFCTRL_SCREENSHOT      14 /* Make a screenshot */
+#define VFCTRL_INIT_EOSD       15 /* Select EOSD renderer */
+#define VFCTRL_DRAW_EOSD       16 /* Render EOSD */
+#define VFCTRL_GET_PTS         17 /* Return last pts value that reached vf_vo */
+#define VFCTRL_SET_DEINTERLACE 18 /* Set deinterlacing status */
+#define VFCTRL_GET_DEINTERLACE 19 /* Get deinterlacing status */
+
+#include "vfcap.h"
+
+//FIXME this should be in a common header, but i dunno which
+#define MP_NOPTS_VALUE (-1LL<<63) //both int64_t and double should be able to represent this exactly
+
+
+// functions (declared here only; the definitions are not part of this header):
+void ff_vf_mpi_clear(mp_image_t* mpi,int x0,int y0,int w,int h);
+mp_image_t* ff_vf_get_image(vf_instance_t* vf, unsigned int outfmt, int mp_imgtype, int mp_imgflag, int w, int h); // filters call this on vf->next to obtain their output image
+
+vf_instance_t* vf_open_plugin(const vf_info_t* const* filter_list, vf_instance_t* next, const char *name, char **args); // NOTE(review): no ff_ prefix, unlike most declarations here
+vf_instance_t* vf_open_filter(vf_instance_t* next, const char *name, char **args); // NOTE(review): no ff_ prefix
+vf_instance_t* ff_vf_add_before_vo(vf_instance_t **vf, char *name, char **args);
+vf_instance_t* vf_open_encoder(vf_instance_t* next, const char *name, char *args); // NOTE(review): no ff_ prefix; args is char* here, char** above
+
+unsigned int ff_vf_match_csp(vf_instance_t** vfp,const unsigned int* list,unsigned int preferred);
+void ff_vf_clone_mpi_attributes(mp_image_t* dst, mp_image_t* src);
+void ff_vf_queue_frame(vf_instance_t *vf, int (*)(vf_instance_t *)); // second parameter is an unnamed callback taking the instance
+int ff_vf_output_queued_frame(vf_instance_t *vf);
+
+// default wrappers: filters call these with their own vf to hand a request on (see the uses in vf_eq.c / vf_eq2.c)
+int ff_vf_next_config(struct vf_instance *vf,
+        int width, int height, int d_width, int d_height,
+        unsigned int flags, unsigned int outfmt);
+int ff_vf_next_control(struct vf_instance *vf, int request, void* data);
+void ff_vf_extra_flip(struct vf_instance *vf);
+int ff_vf_next_query_format(struct vf_instance *vf, unsigned int fmt);
+int ff_vf_next_put_image(struct vf_instance *vf,mp_image_t *mpi, double pts);
+void ff_vf_next_draw_slice (struct vf_instance *vf, unsigned char** src, int* stride, int w,int h, int x, int y);
+
+vf_instance_t* ff_append_filters(vf_instance_t* last);
+
+void ff_vf_uninit_filter(vf_instance_t* vf);
+void ff_vf_uninit_filter_chain(vf_instance_t* vf);
+
+int ff_vf_config_wrapper(struct vf_instance *vf,
+                      int width, int height, int d_width, int d_height,
+                      unsigned int flags, unsigned int outfmt);
+
+static inline int norm_qscale(int qscale, int type)
+{
+    /* Rescale a quantizer according to the codec family given by 'type'.
+     * Type 0 (MPEG-1) and any unrecognised type return qscale as-is, so
+     * only the three scaling cases need an explicit branch. */
+    if (type == 1)              // MPEG-2
+        return qscale >> 1;
+    if (type == 2)              // H264
+        return qscale >> 2;
+    if (type == 3)              // VP56
+        return (63 - qscale + 2) >> 2;
+
+    return qscale;
+}
+
+#endif /* MPLAYER_VF_H */
diff --git a/libavfilter/libmpcodecs/vf_eq.c b/libavfilter/libmpcodecs/vf_eq.c
new file mode 100644
index 0000000000..f8efa846c9
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_eq.c
@@ -0,0 +1,240 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "config.h"
+#include "mp_msg.h"
+#include "cpudetect.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+
+#include "libvo/video_out.h"
+
+struct vf_priv_s {
+        unsigned char *buf;  /* scratch plane-0 buffer, allocated once in put_image() as stride[0]*h bytes */
+        int brightness;      /* first field of the "%d:%d" args, or set via VFCTRL_SET_EQUALIZER */
+        int contrast;        /* second field of the args; when both values are 0 put_image() bypasses processing */
+};
+
+#if HAVE_MMX && HAVE_6REGS
+static void process_MMX(unsigned char *dest, int dstride, unsigned char *src, int sstride,
+                    int w, int h, int brightness, int contrast)
+{
+        int i;
+        int pel;
+        int dstep = dstride-w;   /* bytes to skip after each dest row */
+        int sstep = sstride-w;   /* bytes to skip after each src row */
+        short brvec[4];
+        short contvec[4];
+        /* MMX variant of process_C(): 8 pixels per asm iteration, gain kept in 4.12 fixed point. */
+        contrast = ((contrast+100)*256*16)/100;
+        brightness = ((brightness+100)*511)/200-128 - contrast/32;
+        /* Replicate offset and gain into four 16-bit lanes for paddw / pmulhw. */
+        brvec[0] = brvec[1] = brvec[2] = brvec[3] = brightness;
+        contvec[0] = contvec[1] = contvec[2] = contvec[3] = contrast;
+        /* The asm loop tests its counter at the bottom, so it must be skipped when w < 8 (a count of 0 would wrap). */
+        while (h--) {
+                if (w >> 3) __asm__ volatile (
+                        "movq (%5), %%mm3 \n\t"          /* mm3 = brvec   */
+                        "movq (%6), %%mm4 \n\t"          /* mm4 = contvec */
+                        "pxor %%mm0, %%mm0 \n\t"         /* mm0 = 0, used to widen bytes to words */
+                        "movl %4, %%eax\n\t"             /* eax = number of 8-pixel groups */
+                        ASMALIGN(4)
+                        "1: \n\t"
+                        "movq (%0), %%mm1 \n\t"
+                        "movq (%0), %%mm2 \n\t"
+                        "punpcklbw %%mm0, %%mm1 \n\t"    /* low 4 pixels  -> words */
+                        "punpckhbw %%mm0, %%mm2 \n\t"    /* high 4 pixels -> words */
+                        "psllw $4, %%mm1 \n\t"           /* <<4 so pmulhw's >>16 nets the >>12 of the C tail */
+                        "psllw $4, %%mm2 \n\t"
+                        "pmulhw %%mm4, %%mm1 \n\t"
+                        "pmulhw %%mm4, %%mm2 \n\t"
+                        "paddw %%mm3, %%mm1 \n\t"
+                        "paddw %%mm3, %%mm2 \n\t"
+                        "packuswb %%mm2, %%mm1 \n\t"     /* saturate back to 8 unsigned bytes */
+                        "add $8, %0 \n\t"
+                        "movq %%mm1, (%1) \n\t"
+                        "add $8, %1 \n\t"
+                        "decl %%eax \n\t"
+                        "jnz 1b \n\t"
+                        : "=r" (src), "=r" (dest)
+                        : "0" (src), "1" (dest), "r" (w>>3), "r" (brvec), "r" (contvec)
+                        : "%eax", "memory"               /* reads brvec/contvec/src and writes dest through pointers */
+                );
+                /* Scalar tail for the remaining w%8 pixels (all of the row when w < 8). */
+                for (i = w&7; i; i--)
+                {
+                        pel = ((*src++* contrast)>>12) + brightness;
+                        if(pel&768) pel = (-pel)>>31;
+                        *dest++ = pel;
+                }
+
+                src += sstep;
+                dest += dstep;
+        }
+        __asm__ volatile ( "emms \n\t" ::: "memory" );
+}
+#endif
+
+static void process_C(unsigned char *dest, int dstride, unsigned char *src, int sstride,
+                    int w, int h, int brightness, int contrast)
+{
+        int i;
+        int pel;
+        int dstep = dstride-w;   /* bytes to skip after each dest row */
+        int sstep = sstride-w;   /* bytes to skip after each src row */
+        /* Portable path: out = ((src * gain) >> 16) + offset, with gain in 16.16 fixed point. */
+        contrast = ((contrast+100)*256*256)/100;
+        brightness = ((brightness+100)*511)/200-128 - contrast/512;
+        /* contrast/512 is gain*128, i.e. the offset is taken relative to mid-range input. */
+        while (h--) {
+                for (i = w; i; i--)
+                {
+                        pel = ((*src++* contrast)>>16) + brightness;
+                        if (pel < 0) pel = 0; else if (pel > 255) pel = 255; /* true saturation: the old (pel&768) test missed values >= 1024 */
+                        *dest++ = pel;
+                }
+                src += sstep;
+                dest += dstep;
+        }
+}
+
+static void (*process)(unsigned char *dest, int dstride, unsigned char *src, int sstride,
+                       int w, int h, int brightness, int contrast); /* set in vf_open() to process_C or process_MMX; file-scope, so common to every instance */
+
+/* FIXME: add packed yuv version of process */
+
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
+{
+        mp_image_t *dmpi;
+        /* Export an image whose planes 1/2 alias the input and whose plane 0 is the processed copy. */
+        dmpi=ff_vf_get_image(vf->next, mpi->imgfmt,
+                          MP_IMGTYPE_EXPORT, 0,
+                          mpi->w, mpi->h);
+        if (!dmpi) return 0; /* no output image available; 0 is assumed to signal "nothing output" -- confirm against vf.c */
+        dmpi->stride[0] = mpi->stride[0];
+        dmpi->planes[1] = mpi->planes[1];
+        dmpi->planes[2] = mpi->planes[2];
+        dmpi->stride[1] = mpi->stride[1];
+        dmpi->stride[2] = mpi->stride[2];
+
+        if ((vf->priv->brightness == 0) && (vf->priv->contrast == 0))
+                dmpi->planes[0] = mpi->planes[0];
+        else {
+                /* Allocate on first use only. NOTE(review): the buffer is not resized if stride[0]*h grows later. */
+                if (!vf->priv->buf) vf->priv->buf = malloc(mpi->stride[0]*mpi->h);
+                if (!vf->priv->buf) return 0; /* out of memory: skip the frame rather than write through NULL */
+                dmpi->planes[0] = vf->priv->buf;
+                process(dmpi->planes[0], dmpi->stride[0],
+                        mpi->planes[0], mpi->stride[0],
+                        mpi->w, mpi->h, vf->priv->brightness,
+                        vf->priv->contrast);
+        }
+        return ff_vf_next_put_image(vf,dmpi, pts);
+}
+
+static int control(struct vf_instance *vf, int request, void* data)
+{
+        /*
+         * Equalizer requests carry a vf_equalizer_t naming the property.
+         * "brightness" and "contrast" are served here; everything else is
+         * handed on through ff_vf_next_control().
+         */
+        int setting = (request == VFCTRL_SET_EQUALIZER);
+
+        if (setting || request == VFCTRL_GET_EQUALIZER) {
+                vf_equalizer_t *eq = data;
+                int *slot = NULL;
+
+                if (strcmp(eq->item, "brightness") == 0)
+                        slot = &vf->priv->brightness;
+                else if (strcmp(eq->item, "contrast") == 0)
+                        slot = &vf->priv->contrast;
+
+                if (slot) {
+                        /* copy in the direction the request asks for */
+                        if (setting)
+                                *slot = eq->value;
+                        else
+                                eq->value = *slot;
+                        return CONTROL_TRUE;
+                }
+        }
+
+        return ff_vf_next_control(vf, request, data);
+}
+
+static int query_format(struct vf_instance *vf, unsigned int fmt)
+{
+        /* Formats this filter accepts; for these the final answer comes
+         * from the rest of the chain, every other format is refused. */
+        static const unsigned int accepted[] = {
+                IMGFMT_YVU9, IMGFMT_IF09, IMGFMT_YV12, IMGFMT_I420,
+                IMGFMT_IYUV, IMGFMT_CLPL, IMGFMT_Y800, IMGFMT_Y8,
+                IMGFMT_NV12, IMGFMT_NV21, IMGFMT_444P, IMGFMT_422P,
+                IMGFMT_411P,
+        };
+        unsigned int k;
+
+        for (k = 0; k < sizeof(accepted)/sizeof(accepted[0]); k++) {
+                if (accepted[k] == fmt)
+                        return ff_vf_next_query_format(vf, fmt);
+        }
+
+        /* not in the list */
+        return 0;
+}
+
+static void uninit(struct vf_instance *vf)
+{
+        if (vf->priv) free(vf->priv->buf); /* tolerate a missing private context, as vf_eq2's uninit does */
+        free(vf->priv); vf->priv = NULL;   /* free(NULL) is a no-op; clear the pointer so it cannot dangle */
+}
+
+static int vf_open(vf_instance_t *vf, char *args)
+{
+        /* Allocate zeroed private state first, so a failure leaves vf without callbacks pointing at it. */
+        vf->priv = calloc(1, sizeof(*vf->priv));
+        if (!vf->priv) return 0; /* success returns 1 below; 0 is assumed to mean failure -- confirm against the caller */
+        vf->control=control;
+        vf->query_format=query_format;
+        vf->put_image=put_image;
+        vf->uninit=uninit;
+        if (args) sscanf(args, "%d:%d", &vf->priv->brightness, &vf->priv->contrast); /* fields not given stay 0 */
+
+        process = process_C;
+#if HAVE_MMX && HAVE_6REGS
+        if(ff_gCpuCaps.hasMMX) process = process_MMX;
+#endif
+
+        return 1;
+}
+
+const vf_info_t ff_vf_info_eq = {
+        "soft video equalizer", /* info */
+        "eq",                   /* name */
+        "Richard Felker",       /* author */
+        "",                     /* comment */
+        vf_open,                /* vf_open; the trailing opts member is omitted and therefore NULL */
+};
diff --git a/libavfilter/libmpcodecs/vf_eq2.c b/libavfilter/libmpcodecs/vf_eq2.c
new file mode 100644
index 0000000000..c9c3ff69f4
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_eq2.c
@@ -0,0 +1,519 @@
+/*
+ * Software equalizer (brightness, contrast, gamma, saturation)
+ *
+ * Hampa Hug <hampa@hampa.ch> (original LUT gamma/contrast/brightness filter)
+ * Daniel Moreno <comac@comac.darktech.org> (saturation, R/G/B gamma support)
+ * Richard Felker (original MMX contrast/brightness code (vf_eq.c))
+ * Michael Niedermayer <michalni@gmx.at> (LUT16)
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <inttypes.h>
+
+#include "config.h"
+#include "mp_msg.h"
+#include "cpudetect.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+
+#define LUT16
+
+/* Per channel parameters */
+typedef struct eq2_param_t {
+  unsigned char lut[256];            /* per-byte transfer table, filled by create_lut() */
+#ifdef LUT16
+  uint16_t lut16[256*256];           /* the same table applied to two bytes at once (128 KiB per channel) */
+#endif
+  int           lut_clean;           /* 0 = tables are stale and apply_lut() must rebuild them */
+
+  void (*adjust) (struct eq2_param_t *par, unsigned char *dst, unsigned char *src,
+    unsigned w, unsigned h, unsigned dstride, unsigned sstride); /* chosen by check_values(); NULL means pass the plane through */
+
+  double        c;                   /* contrast (saturation for the chroma channels, see set_saturation) */
+  double        b;                   /* brightness offset */
+  double        g;                   /* gamma */
+  double        w;                   /* gamma weight: blend factor between the linear and the gamma-corrected value */
+} eq2_param_t;
+
+typedef struct vf_priv_s {
+  eq2_param_t param[3];              /* one parameter set per plane; index 0 gets contrast/brightness, 1 and 2 get saturation */
+
+  double        contrast;            /* user-level values as last set; the per-plane copies live in param[] */
+  double        brightness;
+  double        saturation;
+
+  double        gamma;
+  double        gamma_weight;
+  double        rgamma;              /* per-colour gammas, combined into param[].g by set_gamma() */
+  double        ggamma;
+  double        bgamma;
+
+  unsigned      buf_w[3];            /* dimensions the scratch buffers were sized for */
+  unsigned      buf_h[3];
+  unsigned char *buf[3];             /* buf[0] owns the allocation; buf[1] and buf[2] point into it */
+} vf_eq2_t;
+
+
+static
+void create_lut (eq2_param_t *par)
+{
+  unsigned i;
+  double   g, v;
+  double   lw, gw;
+  /* Rebuild par->lut (and lut16) from the current c/b/g/w values and mark the tables clean. */
+  g = par->g;
+  gw = par->w;
+  lw = 1.0 - gw;
+  /* Gamma values outside [0.001, 1000] are treated as 1.0 (no gamma). */
+  if ((g < 0.001) || (g > 1000.0)) {
+    g = 1.0;
+  }
+  /* The exponent applied below is the reciprocal of the gamma. */
+  g = 1.0 / g;
+
+  for (i = 0; i < 256; i++) {
+    v = (double) i / 255.0;
+    v = par->c * (v - 0.5) + 0.5 + par->b;
+    /* v is now the contrast/brightness-adjusted level on a 0..1 scale; clamp, then blend in gamma. */
+    if (v <= 0.0) {
+      par->lut[i] = 0;
+    }
+    else {
+      v = v*lw + pow(v, g)*gw;
+      /* NOTE(review): with w > 1 the blend could in principle go negative before the cast below -- not guarded. */
+      if (v >= 1.0) {
+        par->lut[i] = 255;
+      }
+      else {
+        par->lut[i] = (unsigned char) (256.0 * v);
+      }
+    }
+  }
+
+#ifdef LUT16
+  for(i=0; i<256*256; i++){
+    par->lut16[i]= par->lut[i&0xFF] + (par->lut[i>>8]<<8); /* each byte of the pair is mapped independently, so byte order does not matter */
+  }
+#endif
+
+  par->lut_clean = 1;
+}
+
+#if HAVE_MMX && HAVE_6REGS
+static
+void affine_1d_MMX (eq2_param_t *par, unsigned char *dst, unsigned char *src,
+  unsigned w, unsigned h, unsigned dstride, unsigned sstride)
+{
+  unsigned i;
+  int      contrast, brightness;
+  unsigned dstep, sstep;
+  int      pel;
+  short    brvec[4];
+  short    contvec[4];
+  /* Gamma-free path: out = ((src * gain) >> 12) + offset, 8 pixels per asm iteration. */
+//  printf("\nmmx: src=%p dst=%p w=%d h=%d ds=%d ss=%d\n",src,dst,w,h,dstride,sstride);
+  /* Gain in 4.12 fixed point. NOTE(review): it is stored into a short below, so par->c >= 8 does not fit. */
+  contrast = (int) (par->c * 256 * 16);
+  brightness = ((int) (100.0 * par->b + 100.0) * 511) / 200 - 128 - contrast / 32;
+  /* Replicate offset and gain into four 16-bit lanes for paddw / pmulhw. */
+  brvec[0] = brvec[1] = brvec[2] = brvec[3] = brightness;
+  contvec[0] = contvec[1] = contvec[2] = contvec[3] = contrast;
+
+  sstep = sstride - w;
+  dstep = dstride - w;
+  /* The asm loop tests its counter at the bottom, so it must be skipped when w < 8 (a count of 0 would wrap). */
+  while (h-- > 0) {
+    if (w >> 3) __asm__ volatile (
+      "movq (%5), %%mm3 \n\t"          /* mm3 = brvec   */
+      "movq (%6), %%mm4 \n\t"          /* mm4 = contvec */
+      "pxor %%mm0, %%mm0 \n\t"         /* mm0 = 0, used to widen bytes to words */
+      "movl %4, %%eax\n\t"             /* eax = number of 8-pixel groups */
+      ASMALIGN(4)
+      "1: \n\t"
+      "movq (%0), %%mm1 \n\t"
+      "movq (%0), %%mm2 \n\t"
+      "punpcklbw %%mm0, %%mm1 \n\t"
+      "punpckhbw %%mm0, %%mm2 \n\t"
+      "psllw $4, %%mm1 \n\t"           /* <<4 so pmulhw's >>16 nets the >>12 of the C tail */
+      "psllw $4, %%mm2 \n\t"
+      "pmulhw %%mm4, %%mm1 \n\t"
+      "pmulhw %%mm4, %%mm2 \n\t"
+      "paddw %%mm3, %%mm1 \n\t"
+      "paddw %%mm3, %%mm2 \n\t"
+      "packuswb %%mm2, %%mm1 \n\t"     /* saturate back to 8 unsigned bytes */
+      "add $8, %0 \n\t"
+      "movq %%mm1, (%1) \n\t"
+      "add $8, %1 \n\t"
+      "decl %%eax \n\t"
+      "jnz 1b \n\t"
+      : "=r" (src), "=r" (dst)
+      : "0" (src), "1" (dst), "r" (w >> 3), "r" (brvec), "r" (contvec)
+      : "%eax", "memory"               /* reads brvec/contvec/src and writes dst through pointers */
+    );
+    /* Scalar tail for the remaining w%8 pixels (all of the row when w < 8). */
+    for (i = w & 7; i > 0; i--) {
+      pel = ((*src++ * contrast) >> 12) + brightness;
+      if (pel & 768) {
+        pel = (-pel) >> 31;
+      }
+      *dst++ = pel;
+    }
+
+    src += sstep;
+    dst += dstep;
+  }
+
+  __asm__ volatile ( "emms \n\t" ::: "memory" );
+}
+#endif
+
+static
+void apply_lut (eq2_param_t *par, unsigned char *dst, unsigned char *src,
+  unsigned w, unsigned h, unsigned dstride, unsigned sstride)
+{
+  unsigned      i, j, w2;
+  unsigned char *lut;
+  uint16_t *lut16;
+  /* Map a w x h plane through the lookup table, rebuilding the table first if it is stale. */
+  if (!par->lut_clean) {
+    create_lut (par);
+  }
+  /* Note: the for(j) loop is opened inside either #ifdef branch and closed once after #endif. */
+  lut = par->lut;
+#ifdef LUT16
+  lut16 = par->lut16;
+  w2= (w>>3)<<2; /* number of 16-bit elements covered by whole groups of 8 pixels */
+  for (j = 0; j < h; j++) {
+    uint16_t *src16= (uint16_t*)src; /* NOTE(review): byte rows accessed as uint16_t -- relies on the target tolerating unaligned/aliased access */
+    uint16_t *dst16= (uint16_t*)dst;
+    for (i = 0; i < w2; i+=4) {
+      dst16[i+0] = lut16[src16[i+0]];
+      dst16[i+1] = lut16[src16[i+1]];
+      dst16[i+2] = lut16[src16[i+2]];
+      dst16[i+3] = lut16[src16[i+3]];
+    }
+    i <<= 1; /* convert the 16-bit element index back to a byte index for the tail loop */
+#else
+  w2= (w>>3)<<3;
+  for (j = 0; j < h; j++) {
+    for (i = 0; i < w2; i+=8) {
+      dst[i+0] = lut[src[i+0]];
+      dst[i+1] = lut[src[i+1]];
+      dst[i+2] = lut[src[i+2]];
+      dst[i+3] = lut[src[i+3]];
+      dst[i+4] = lut[src[i+4]];
+      dst[i+5] = lut[src[i+5]];
+      dst[i+6] = lut[src[i+6]];
+      dst[i+7] = lut[src[i+7]];
+    }
+#endif
+    for (; i < w; i++) { /* remaining w%8 pixels, one byte at a time */
+      dst[i] = lut[src[i]];
+    }
+
+    src += sstride;
+    dst += dstride;
+  }
+}
+
+static
+int put_image (vf_instance_t *vf, mp_image_t *src, double pts)
+{
+  /* Run each plane through its adjust() hook into a private buffer (or alias the input) and pass the result on. */
+  unsigned      i;
+  vf_eq2_t      *eq2 = vf->priv;
+  mp_image_t    *dst;
+  unsigned long img_n,img_c;
+  unsigned char *nbuf;
+  /* (Re)size the scratch buffer whenever the frame dimensions change. */
+  if ((eq2->buf_w[0] != src->w) || (eq2->buf_h[0] != src->h)) {
+    eq2->buf_w[0] = src->w;
+    eq2->buf_h[0] = src->h;
+    eq2->buf_w[1] = eq2->buf_w[2] = src->w >> src->chroma_x_shift;
+    eq2->buf_h[1] = eq2->buf_h[2] = src->h >> src->chroma_y_shift;
+    img_n = (unsigned long) eq2->buf_w[0]*eq2->buf_h[0];  /* widen before multiplying */
+    img_c = (src->num_planes>1) ? (unsigned long) eq2->buf_w[1]*eq2->buf_h[1] : 0;
+    nbuf = realloc (eq2->buf[0], img_n + 2*img_c + 1);    /* +1 keeps the request non-zero */
+    if (nbuf == NULL) {
+      eq2->buf_w[0] = eq2->buf_h[0] = 0;  /* old block is still owned by buf[0]; force a retry on the next frame */
+      return 0;                           /* 0 is assumed to signal "nothing output" -- confirm against vf.c */
+    }
+    eq2->buf[0] = nbuf;
+    if (src->num_planes>1) {
+      eq2->buf[1] = eq2->buf[0] + img_n;
+      eq2->buf[2] = eq2->buf[1] + img_c;
+    }
+  }
+  dst = ff_vf_get_image (vf->next, src->imgfmt, MP_IMGTYPE_EXPORT, 0, src->w, src->h);
+  if (dst == NULL) return 0;
+  for (i = 0; i < ((src->num_planes>1)?3:1); i++) {
+    if (eq2->param[i].adjust != NULL) {
+      dst->planes[i] = eq2->buf[i];
+      dst->stride[i] = eq2->buf_w[i];
+      eq2->param[i].adjust (&eq2->param[i], dst->planes[i], src->planes[i],
+        eq2->buf_w[i], eq2->buf_h[i], dst->stride[i], src->stride[i]);
+    } else {
+      dst->planes[i] = src->planes[i];  /* nothing to adjust: export the input plane unchanged */
+      dst->stride[i] = src->stride[i];
+    }
+  }
+  return ff_vf_next_put_image (vf, dst, pts);
+}
+
+static
+void check_values (eq2_param_t *par)
+{
+  /* Pick the cheapest adjust() routine for the current parameters (exact floating point comparisons, yuck). */
+
+  if ((par->c == 1.0) && (par->b == 0.0) && (par->g == 1.0)) {
+    par->adjust = NULL; /* identity: put_image() then exports the source plane directly */
+  }
+#if HAVE_MMX && HAVE_6REGS
+  else if (par->g == 1.0 && ff_gCpuCaps.hasMMX) {
+    par->adjust = &affine_1d_MMX; /* no gamma, so contrast/brightness alone can use the affine MMX routine */
+  }
+#endif
+  else {
+    par->adjust = &apply_lut; /* general case, table-driven */
+  }
+}
+
+static
+void print_values (vf_eq2_t *eq2)
+{
+  ff_mp_msg (MSGT_VFILTER, MSGL_V, "vf_eq2: c=%.2f b=%.2f g=%.4f s=%.2f \n", /* logs the four user-level settings at MSGL_V */
+    eq2->contrast, eq2->brightness, eq2->gamma, eq2->saturation
+  );
+}
+
+static
+void set_contrast (vf_eq2_t *eq2, double c)
+{
+  eq2->contrast = c;
+  eq2->param[0].c = c;           /* contrast only affects plane 0 */
+  eq2->param[0].lut_clean = 0;   /* force apply_lut() to rebuild its table */
+  check_values (&eq2->param[0]); /* re-select the adjust() routine for the new value */
+  print_values (eq2);
+}
+
+static
+void set_brightness (vf_eq2_t *eq2, double b)
+{
+  eq2->brightness = b;
+  eq2->param[0].b = b;           /* brightness only affects plane 0 */
+  eq2->param[0].lut_clean = 0;   /* force apply_lut() to rebuild its table */
+  check_values (&eq2->param[0]); /* re-select the adjust() routine for the new value */
+  print_values (eq2);
+}
+
+static
+void set_gamma (vf_eq2_t *eq2, double g)
+{
+  eq2->gamma = g;
+  /* Derive the per-plane gammas from the overall gamma and the r/g/b gammas; all planes share gamma_weight. */
+  eq2->param[0].g = eq2->gamma * eq2->ggamma;
+  eq2->param[1].g = sqrt (eq2->bgamma / eq2->ggamma); /* NOTE(review): ggamma == 0 is not guarded against */
+  eq2->param[2].g = sqrt (eq2->rgamma / eq2->ggamma);
+  eq2->param[0].w = eq2->param[1].w = eq2->param[2].w = eq2->gamma_weight;
+  /* Every plane's table is now stale. */
+  eq2->param[0].lut_clean = 0;
+  eq2->param[1].lut_clean = 0;
+  eq2->param[2].lut_clean = 0;
+
+  check_values (&eq2->param[0]);
+  check_values (&eq2->param[1]);
+  check_values (&eq2->param[2]);
+
+  print_values (eq2);
+}
+
+static
+void set_saturation (vf_eq2_t *eq2, double s)
+{
+  eq2->saturation = s;
+  /* Saturation is stored as the contrast of planes 1 and 2. */
+  eq2->param[1].c = s;
+  eq2->param[2].c = s;
+
+  eq2->param[1].lut_clean = 0;
+  eq2->param[2].lut_clean = 0;
+
+  check_values (&eq2->param[1]);
+  check_values (&eq2->param[2]);
+
+  print_values (eq2);
+}
+
+static
+int control (vf_instance_t *vf, int request, void *data)
+{
+  vf_equalizer_t *eq;
+  /* Translate between the integer equalizer values and the filter's floating point settings. */
+  switch (request) {
+    case VFCTRL_SET_EQUALIZER:
+      eq = (vf_equalizer_t *) data;
+
+      if (strcmp (eq->item, "gamma") == 0) {
+        set_gamma (vf->priv, exp (log (8.0) * eq->value / 100.0)); /* exponential mapping: gamma = 8^(value/100) */
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "contrast") == 0) {
+        set_contrast (vf->priv, (1.0 / 100.0) * (eq->value + 100)); /* value 0 -> 1.0 */
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "brightness") == 0) {
+        set_brightness (vf->priv, (1.0 / 100.0) * eq->value); /* value 0 -> 0.0 */
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "saturation") == 0) {
+        set_saturation (vf->priv, (double) (eq->value + 100) / 100.0); /* value 0 -> 1.0 */
+        return CONTROL_TRUE;
+      }
+      break;
+
+    case VFCTRL_GET_EQUALIZER: /* inverse of the mappings above, truncated to int */
+      eq = (vf_equalizer_t *) data;
+      if (strcmp (eq->item, "gamma") == 0) {
+        eq->value = (int) (100.0 * log (vf->priv->gamma) / log (8.0)); /* NOTE(review): gamma <= 0 is not guarded against */
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "contrast") == 0) {
+        eq->value = (int) (100.0 * vf->priv->contrast) - 100;
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "brightness") == 0) {
+        eq->value = (int) (100.0 * vf->priv->brightness);
+        return CONTROL_TRUE;
+      }
+      else if (strcmp (eq->item, "saturation") == 0) {
+        eq->value = (int) (100.0 * vf->priv->saturation) - 100;
+        return CONTROL_TRUE;
+      }
+      break;
+  }
+  /* Unknown request or property name: hand it on. */
+  return ff_vf_next_control (vf, request, data);
+}
+
+static
+int query_format (vf_instance_t *vf, unsigned fmt)
+{
+  switch (fmt) { /* accepted formats are answered by the rest of the chain; all others are refused */
+    case IMGFMT_YVU9:
+    case IMGFMT_IF09:
+    case IMGFMT_YV12:
+    case IMGFMT_I420:
+    case IMGFMT_IYUV:
+    case IMGFMT_Y800:
+    case IMGFMT_Y8:
+    case IMGFMT_444P:
+    case IMGFMT_422P:
+    case IMGFMT_411P:
+      return ff_vf_next_query_format (vf, fmt);
+  }
+
+  return 0;
+}
+
+static
+void uninit (vf_instance_t *vf)
+{
+  vf_eq2_t *ctx = vf->priv;
+
+  if (ctx == NULL) return;         /* no private state, nothing to release */
+  free (ctx->buf[0]); free (ctx);  /* buf[1] and buf[2] point into buf[0], so one free covers all planes */
+}
+
+static
+int vf_open(vf_instance_t *vf, char *args)
+{
+  unsigned i;
+  vf_eq2_t *eq2;
+  double   par[8];
+  /* Allocate the private state, load neutral defaults, then apply the optional ':'-separated args. */
+  eq2 = malloc (sizeof (vf_eq2_t));
+  if (eq2 == NULL) return 0;  /* success returns 1 below; 0 is assumed to mean failure -- confirm against the caller */
+  vf->priv = eq2;
+  vf->control = control;
+  vf->query_format = query_format;
+  vf->put_image = put_image;
+  vf->uninit = uninit;
+
+  for (i = 0; i < 3; i++) {
+    eq2->buf[i] = NULL;
+    eq2->buf_w[i] = 0;
+    eq2->buf_h[i] = 0;
+    eq2->param[i].adjust = NULL;
+    eq2->param[i].c = 1.0;
+    eq2->param[i].b = 0.0;
+    eq2->param[i].g = 1.0;
+    eq2->param[i].w = 1.0;  /* matches gamma_weight below; otherwise create_lut() reads it uninitialized when no args are given */
+    eq2->param[i].lut_clean = 0;
+  }
+
+  eq2->contrast = 1.0;
+  eq2->brightness = 0.0;
+  eq2->saturation = 1.0;
+
+  eq2->gamma = 1.0;
+  eq2->gamma_weight = 1.0;
+  eq2->rgamma = 1.0;
+  eq2->ggamma = 1.0;
+  eq2->bgamma = 1.0;
+
+  if (args != NULL) {
+    par[0] = 1.0;  /* gamma */
+    par[1] = 1.0;  /* contrast */
+    par[2] = 0.0;  /* brightness */
+    par[3] = 1.0;  /* saturation */
+    par[4] = 1.0;  /* rgamma */
+    par[5] = 1.0;  /* ggamma */
+    par[6] = 1.0;  /* bgamma */
+    par[7] = 1.0;  /* gamma weight */
+    sscanf (args, "%lf:%lf:%lf:%lf:%lf:%lf:%lf:%lf",
+      par, par + 1, par + 2, par + 3, par + 4, par + 5, par + 6, par + 7
+    );  /* fields that are missing or unparsable keep the defaults above */
+
+    eq2->rgamma = par[4];
+    eq2->ggamma = par[5];
+    eq2->bgamma = par[6];
+    eq2->gamma_weight = par[7];
+
+    set_gamma (eq2, par[0]);  /* must follow the r/g/b gamma and weight assignments it reads */
+    set_contrast (eq2, par[1]);
+    set_brightness (eq2, par[2]);
+    set_saturation (eq2, par[3]);
+  }
+
+  return 1;
+}
+
+const vf_info_t ff_vf_info_eq2 = {
+  "Software equalizer",                         /* info */
+  "eq2",                                        /* name */
+  "Hampa Hug, Daniel Moreno, Richard Felker",   /* author */
+  "",                                           /* comment */
+  &vf_open,                                     /* vf_open */
+  NULL                                          /* opts: no option description */
+};
diff --git a/libavfilter/libmpcodecs/vf_fspp.c b/libavfilter/libmpcodecs/vf_fspp.c
new file mode 100644
index 0000000000..e1b26bc625
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_fspp.c
@@ -0,0 +1,2125 @@
+/*
+ * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * This implementation is based on an algorithm described in
+ * "Aria Nosratinia Embedded Post-Processing for
+ * Enhancement of Compressed Images (1999)"
+ * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
+ * Further, with splitting (i)dct into hor/ver passes, one of them can be
+ * performed once per block, not pixel. This allows for much better speed.
+ */
+
+/*
+  Heavily optimized version of SPP filter by Nikolaj
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <math.h>
+
+#include "config.h"
+
+#include "mp_msg.h"
+#include "cpudetect.h"
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+#include "av_helpers.h"
+#include "libvo/fastmemcpy.h"
+
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+#undef free
+#undef malloc
+
+//===========================================================================//
+#define BLOCKSZ 12 //size, in 8x8 blocks of int16, of each of the two on-stack workspaces in filter()
+
+static const short custom_threshold[64]=
+// Base thresholds, one per coefficient of an 8x8 block; vf_open rescales them by bias/71.
+// The values (max 296) can't be too high: that causes too much dependence on the quantizer,
+// or maybe overflow (to be checked), which results in some flashing.
+{ 71, 296, 295, 237,  71,  40,  38,  19,
+  245, 193, 185, 121, 102,  73,  53,  27,
+  158, 129, 141, 107,  97,  73,  50,  26,
+  102, 116, 109,  98,  82,  66,  45,  23,
+  71,  94,  95,  81,  70,  56,  38,  20,
+  56,  77,  74,  66,  56,  44,  30,  15,
+  38,  53,  50,  45,  38,  30,  21,  11,
+  20,  27,  26,  23,  20,  15,  11,   5
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { //8x8 dither matrix (values 0..63); the store_slice functions add row y, shifted right by log2_scale, before the final down-shift
+    {  0,  48,  12,  60,   3,  51,  15,  63, },
+    { 32,  16,  44,  28,  35,  19,  47,  31, },
+    {  8,  56,   4,  52,  11,  59,   7,  55, },
+    { 40,  24,  36,  20,  43,  27,  39,  23, },
+    {  2,  50,  14,  62,   1,  49,  13,  61, },
+    { 34,  18,  46,  30,  33,  17,  45,  29, },
+    { 10,  58,   6,  54,   9,  57,   5,  53, },
+    { 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+
+struct vf_priv_s { //align 16 ! (private state of one fspp instance)
+    uint64_t threshold_mtx_noq[8*2]; //64 packed 16-bit thresholds before quantizer scaling (filled by vf_open); must stay directly in front of threshold_mtx, mul_thrmat_mmx addresses that one as this + 128 bytes
+    uint64_t threshold_mtx[8*2];//threshold_mtx_noq multiplied by the current quantizer; used in both C & MMX (& later SSE2) versions
+
+    int log2_count; //quality level; vf_open keeps it in 4..5, filter() derives its row step (6-log2_count) from it
+    int temp_stride; //row stride of the work buffers: width+16 rounded up to a multiple of 16 (set in config)
+    int qp; //quantizer forced through the filter arguments; 0 means the qscale table of each frame is used
+    int mpeg2; //copy of mpi->qscale_type, handed to norm_qscale
+    int prev_q; //quantizer that threshold_mtx was last built for
+    uint8_t *src; //copy of the current plane with an 8 sample mirrored border (allocated in config)
+    int16_t *temp; //int16 accumulation buffer, 3*8 rows of temp_stride samples (allocated in config)
+    int bframes; //from the filter arguments; when non-zero put_image prefers the current frame's qscale table over non_b_qp
+    char *non_b_qp; //copy of the qscale table of the last frame whose pict_type was not 3
+};
+
+
+#if !HAVE_MMX
+
+//Stores slice 1: dithers, scales and clips the rows at src into dst, then zeroes them and the rows 8 lines above (slices 0 & 1)
+static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
+{
+    int y;
+//STORE(pos) emits one output sample; it uses the locals x, d and temp of the loops below
+#define STORE(pos)                                                        \
+    temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
+    src[x + pos]=src[x + pos - 8*src_stride]=0;                                \
+    if(temp & 0x100) temp= ~(temp>>31);                                        \
+    dst[x + pos]= temp;
+
+    for(y=0; y<height; y++, src+=src_stride, dst+=dst_stride){
+        const uint8_t *d= dither[y]; //dither row belonging to this output row
+        int x;
+
+        //samples are handled in groups of 8, so the last group may run past width
+        for(x=0; x<width; x+=8){
+            int temp;
+            int i;
+
+            for(i=0; i<8; i++){
+                STORE(i);
+            }
+        }
+    }
+}
+
+//Stores slices 0 & 2: sums the rows at src with the rows 16 lines below, writes the sum dithered, scaled and clipped to dst, and zeroes only the lower rows
+static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
+{
+    int y;
+//STORE2(pos) emits one output sample; it uses the locals x, d and temp of the loops below
+#define STORE2(pos)                                                        \
+    temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
+    src[x + pos + 16*src_stride]=0;                                        \
+    if(temp & 0x100) temp= ~(temp>>31);                                        \
+    dst[x + pos]= temp;
+
+    for(y=0; y<height; y++, src+=src_stride, dst+=dst_stride){
+        const uint8_t *d= dither[y]; //dither row belonging to this output row
+        int x;
+
+        //samples are handled in groups of 8, so the last group may run past width
+        for(x=0; x<width; x+=8){
+            int temp;
+            int i;
+
+            for(i=0; i<8; i++){
+                STORE2(i);
+            }
+        }
+    }
+}
+
+static void mul_thrmat_c(struct vf_priv_s *p,int q) //threshold_mtx = q * threshold_mtx_noq, entry by entry (64 16-bit values)
+{
+    const short *base= (const short*)p->threshold_mtx_noq;
+    short *scaled= (short*)p->threshold_mtx, *end= scaled+64;
+    while(scaled<end) *scaled++= q * *base++; //the product is computed in int, then narrowed back to short
+}
+
+static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt); //defined further down in this file
+static void row_idct_c(int16_t* workspace,
+                       int16_t* output_adr, int output_stride, int cnt);
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt);
+
+//the *_s names bind the C implementations at compile time (the HAVE_MMX branch defines the same names); rather ugly, but there is no need for function pointers
+#define store_slice_s store_slice_c
+#define store_slice2_s store_slice2_c
+#define mul_thrmat_s mul_thrmat_c
+#define column_fidct_s column_fidct_c
+#define row_idct_s row_idct_c
+#define row_fdct_s row_fdct_c
+
+#else /* HAVE_MMX */
+
+//MMX store of slice 1: reads the rows at src, writes them dithered, scaled and saturated to dst, and clears the rows at src and 8 lines above (slices 0 & 1)
+static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
+{
+    const uint8_t *od=&dither[0][0];
+    const uint8_t *end=&dither[height][0]; //loop bound: one dither row is consumed per output row
+    width = (width+7)&~7; //whole groups of 8 samples
+    dst_stride-=width; //gap from the end of one dst row to the start of the next
+    //src_stride=(src_stride-width)*2; is computed inside the asm and stored back into operand %1. NOTE(review): %1 is declared as an input, and no mm register is listed as clobbered
+    __asm__ volatile(
+        "mov %5, %%"REG_d"                \n\t" //d = log2_scale
+        "mov %6, %%"REG_S"                \n\t" //S = src
+        "mov %7, %%"REG_D"                \n\t" //D = dst
+        "mov %1, %%"REG_a"                \n\t" //a = src_stride
+        "movd %%"REG_d", %%mm5             \n\t" //mm5 = log2_scale, the dither shift
+        "xor $-1, %%"REG_d"              \n\t"
+        "mov %%"REG_a", %%"REG_c"             \n\t"
+        "add $7, %%"REG_d"               \n\t" //d = 6-log2_scale
+        "neg %%"REG_a"                   \n\t"
+        "sub %0, %%"REG_c"            \n\t"
+        "add %%"REG_c", %%"REG_c"             \n\t" //c = 2*(src_stride-width), the byte gap between src rows
+        "movd %%"REG_d", %%mm2             \n\t" //mm2 = 6-log2_scale, the final shift
+        "mov %%"REG_c", %1       \n\t"
+        "mov %2, %%"REG_d"               \n\t" //d = current dither row
+        "shl $4, %%"REG_a"               \n\t" //a = -16*src_stride bytes, i.e. 8 rows of int16 above
+
+        "2:                        \n\t" //once per row
+        "movq (%%"REG_d"), %%mm3           \n\t"
+        "movq %%mm3, %%mm4             \n\t"
+        "pxor %%mm7, %%mm7             \n\t"
+        "punpcklbw %%mm7, %%mm3        \n\t"
+        "punpckhbw %%mm7, %%mm4        \n\t"
+        "mov %0, %%"REG_c"            \n\t"
+        "psraw %%mm5, %%mm3            \n\t"
+        "psraw %%mm5, %%mm4            \n\t"
+        "1:                        \n\t" //once per group of 8 samples
+        "movq %%mm7, (%%"REG_S",%%"REG_a")     \n\t"
+        "movq (%%"REG_S"), %%mm0           \n\t"
+        "movq 8(%%"REG_S"), %%mm1          \n\t"
+
+        "movq %%mm7, 8(%%"REG_S",%%"REG_a")    \n\t"
+        "paddw %%mm3, %%mm0            \n\t"
+        "paddw %%mm4, %%mm1            \n\t"
+
+        "movq %%mm7, (%%"REG_S")           \n\t"
+        "psraw %%mm2, %%mm0            \n\t"
+        "psraw %%mm2, %%mm1            \n\t"
+
+        "movq %%mm7, 8(%%"REG_S")          \n\t"
+        "packuswb %%mm1, %%mm0         \n\t" //saturate the 8 words to bytes
+        "add $16, %%"REG_S"              \n\t"
+
+        "movq %%mm0, (%%"REG_D")           \n\t"
+        "add $8, %%"REG_D"               \n\t"
+        "sub $8, %%"REG_c"               \n\t"
+        "jg 1b                      \n\t"
+        "add %1, %%"REG_S"       \n\t"
+        "add $8, %%"REG_d"               \n\t"
+        "add %3, %%"REG_D"       \n\t"
+        "cmp %4, %%"REG_d"           \n\t"
+        "jl 2b                      \n\t"
+
+        :
+        : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
+          "m" (log2_scale), "m" (src), "m" (dst) //input
+        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+        );
+}
+
+//MMX store of slices 0 & 2: sums the rows at src with the rows 16 lines below, writes the sum dithered, scaled and saturated to dst, and clears only the lower rows
+static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
+{
+    const uint8_t *od=&dither[0][0];
+    const uint8_t *end=&dither[height][0]; //loop bound: one dither row is consumed per output row
+    width = (width+7)&~7; //whole groups of 8 samples
+    dst_stride-=width; //gap from the end of one dst row to the start of the next
+    //src_stride=(src_stride-width)*2; is computed inside the asm and stored back into operand %1. NOTE(review): %1 is declared as an input, and no mm register is listed as clobbered
+    __asm__ volatile(
+        "mov %5, %%"REG_d"                \n\t" //d = log2_scale
+        "mov %6, %%"REG_S"                \n\t" //S = src
+        "mov %7, %%"REG_D"                \n\t" //D = dst
+        "mov %1, %%"REG_a"            \n\t" //a = src_stride
+        "movd %%"REG_d", %%mm5             \n\t" //mm5 = log2_scale, the dither shift
+        "xor $-1, %%"REG_d"              \n\t"
+        "mov %%"REG_a", %%"REG_c"             \n\t"
+        "add $7, %%"REG_d"               \n\t" //d = 6-log2_scale
+        "sub %0, %%"REG_c"            \n\t"
+        "add %%"REG_c", %%"REG_c"             \n\t" //c = 2*(src_stride-width), the byte gap between src rows
+        "movd %%"REG_d", %%mm2             \n\t" //mm2 = 6-log2_scale, the final shift
+        "mov %%"REG_c", %1       \n\t"
+        "mov %2, %%"REG_d"               \n\t" //d = current dither row
+        "shl $5, %%"REG_a"               \n\t" //a = 32*src_stride bytes, i.e. 16 rows of int16 below
+
+        "2:                        \n\t" //once per row
+        "movq (%%"REG_d"), %%mm3           \n\t"
+        "movq %%mm3, %%mm4             \n\t"
+        "pxor %%mm7, %%mm7             \n\t"
+        "punpcklbw %%mm7, %%mm3        \n\t"
+        "punpckhbw %%mm7, %%mm4        \n\t"
+        "mov %0, %%"REG_c"            \n\t"
+        "psraw %%mm5, %%mm3            \n\t"
+        "psraw %%mm5, %%mm4            \n\t"
+        "1:                        \n\t" //once per group of 8 samples
+        "movq (%%"REG_S"), %%mm0           \n\t"
+        "movq 8(%%"REG_S"), %%mm1          \n\t"
+        "paddw %%mm3, %%mm0            \n\t"
+
+        "paddw (%%"REG_S",%%"REG_a"), %%mm0    \n\t"
+        "paddw %%mm4, %%mm1            \n\t"
+        "movq 8(%%"REG_S",%%"REG_a"), %%mm6    \n\t"
+
+        "movq %%mm7, (%%"REG_S",%%"REG_a")     \n\t"
+        "psraw %%mm2, %%mm0            \n\t"
+        "paddw %%mm6, %%mm1            \n\t"
+
+        "movq %%mm7, 8(%%"REG_S",%%"REG_a")    \n\t"
+        "psraw %%mm2, %%mm1            \n\t"
+        "packuswb %%mm1, %%mm0         \n\t" //saturate the 8 words to bytes
+
+        "movq %%mm0, (%%"REG_D")           \n\t"
+        "add $16, %%"REG_S"              \n\t"
+        "add $8, %%"REG_D"               \n\t"
+        "sub $8, %%"REG_c"               \n\t"
+        "jg 1b                      \n\t"
+        "add %1, %%"REG_S"       \n\t"
+        "add $8, %%"REG_d"               \n\t"
+        "add %3, %%"REG_D"       \n\t"
+        "cmp %4, %%"REG_d"           \n\t"
+        "jl 2b                      \n\t"
+
+        :
+        : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
+          "m" (log2_scale), "m" (src), "m" (dst) //input
+        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
+        );
+}
+
+static void mul_thrmat_mmx(struct vf_priv_s *p, int q) //MMX: threshold_mtx = q * threshold_mtx_noq for all 16 quadwords (64 16-bit entries)
+{
+    uint64_t *adr=&p->threshold_mtx_noq[0]; //source; the destination is reached as adr + 128 bytes, which relies on threshold_mtx directly following threshold_mtx_noq in the struct
+    __asm__ volatile(
+        "movd %0, %%mm7                \n\t"
+        "add $8*8*2, %%"REG_D"            \n\t" //D now points at threshold_mtx
+        "movq 0*8(%%"REG_S"), %%mm0        \n\t"
+        "punpcklwd %%mm7, %%mm7        \n\t"
+        "movq 1*8(%%"REG_S"), %%mm1        \n\t"
+        "punpckldq %%mm7, %%mm7        \n\t" //mm7 = q in all four words
+        "pmullw %%mm7, %%mm0           \n\t"
+
+        "movq 2*8(%%"REG_S"), %%mm2        \n\t"
+        "pmullw %%mm7, %%mm1           \n\t"
+
+        "movq 3*8(%%"REG_S"), %%mm3        \n\t"
+        "pmullw %%mm7, %%mm2           \n\t"
+
+        "movq %%mm0, 0*8(%%"REG_D")        \n\t"
+        "movq 4*8(%%"REG_S"), %%mm4        \n\t"
+        "pmullw %%mm7, %%mm3           \n\t"
+
+        "movq %%mm1, 1*8(%%"REG_D")        \n\t"
+        "movq 5*8(%%"REG_S"), %%mm5        \n\t"
+        "pmullw %%mm7, %%mm4           \n\t"
+
+        "movq %%mm2, 2*8(%%"REG_D")        \n\t"
+        "movq 6*8(%%"REG_S"), %%mm6        \n\t"
+        "pmullw %%mm7, %%mm5           \n\t"
+
+        "movq %%mm3, 3*8(%%"REG_D")        \n\t"
+        "movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t" //second batch: quadwords 7..13
+        "pmullw %%mm7, %%mm6           \n\t"
+
+        "movq %%mm4, 4*8(%%"REG_D")        \n\t"
+        "movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
+        "pmullw %%mm7, %%mm0           \n\t"
+
+        "movq %%mm5, 5*8(%%"REG_D")        \n\t"
+        "movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
+        "pmullw %%mm7, %%mm1           \n\t"
+
+        "movq %%mm6, 6*8(%%"REG_D")        \n\t"
+        "movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
+        "pmullw %%mm7, %%mm2           \n\t"
+
+        "movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
+        "movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
+        "pmullw %%mm7, %%mm3           \n\t"
+
+        "movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
+        "movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
+        "pmullw %%mm7, %%mm4           \n\t"
+
+        "movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
+        "movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
+        "pmullw %%mm7, %%mm5           \n\t"
+
+        "movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
+        "movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t" //last batch: quadwords 14 and 15
+        "pmullw %%mm7, %%mm6           \n\t"
+
+        "movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
+        "movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
+        "pmullw %%mm7, %%mm0           \n\t"
+
+        "movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
+        "pmullw %%mm7, %%mm1           \n\t"
+
+        "movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
+        "movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
+        "movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"
+
+        : "+g" (q), "+S" (adr), "+D" (adr)
+        :
+        );
+}
+
+static void column_fidct_mmx(int16_t* thr_adr,  int16_t *data,  int16_t *output,  int cnt); //defined further down in this file
+static void row_idct_mmx(int16_t* workspace,
+                         int16_t* output_adr,  int output_stride,  int cnt);
+static void row_fdct_mmx(int16_t *data,  const uint8_t *pixels,  int line_size,  int cnt);
+//with HAVE_MMX the *_s names used by the rest of the file resolve to the MMX implementations
+#define store_slice_s store_slice_mmx
+#define store_slice2_s store_slice2_mmx
+#define mul_thrmat_s mul_thrmat_mmx
+#define column_fidct_s column_fidct_mmx
+#define row_idct_s row_idct_mmx
+#define row_fdct_s row_fdct_mmx
+#endif // HAVE_MMX
+//Filters one plane from src into dst, using p->src as a border-padded copy of the input and p->temp as the int16 accumulation buffer
+static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
+                   int dst_stride, int src_stride,
+                   int width, int height,
+                   uint8_t *qp_store, int qp_stride, int is_luma)
+{
+    int x, x0, y, es, qy, t;
+    const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
+    const int step=6-p->log2_count; //row increment of the main loop below
+    const int qps= 3 + is_luma; //shift that turns a pixel coordinate into a qp_store index
+    DECLARE_ALIGNED(32, int32_t, block_align)[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
+    int16_t *block= (int16_t *)block_align; //first half of block_align: input of column_fidct_s
+    int16_t *block3=(int16_t *)(block_align+4*8*BLOCKSZ); //second half of block_align: output of column_fidct_s
+
+    memset(block3, 0, 4*8*BLOCKSZ);
+
+    //p->src=src-src_stride*8-8;//!
+    if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
+    for(y=0; y<height; y++){ //copy each row into p->src, 8 rows down and 8 samples in
+        int index= 8 + 8*stride + y*stride;
+        fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
+        for(x=0; x<8; x++){ //mirror 8 samples into the left and the right border
+            p->src[index         - x - 1]= p->src[index +         x    ];
+            p->src[index + width + x    ]= p->src[index + width - x - 1];
+        }
+    }
+    for(y=0; y<8; y++){ //mirror 8 rows into the top and the bottom border
+        fast_memcpy(p->src + (      7-y)*stride, p->src + (      y+8)*stride, stride);
+        fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
+    }
+    //FIXME (try edge emu)
+
+    for(y=8; y<24; y++) //clear rows 8..23 of the accumulation buffer
+        memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
+
+    for(y=step; y<height+8; y+=step){    //step= 1,2
+        qy=y-4;
+        if (qy>height-1) qy=height-1;
+        if (qy<0) qy=0;
+        qy=(qy>>qps)*qp_stride; //offset of this row's entries inside qp_store
+        row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
+        for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){ //full runs of 8*(BLOCKSZ-1) columns
+            row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
+            if(p->qp) //forced quantizer: threshold_mtx was scaled once in vf_open, one call covers the whole run
+                column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
+            else //quantizer taken from qp_store; threshold_mtx is rebuilt only when the value changes
+                for (x=0; x<8*(BLOCKSZ-1); x+=8) {
+                    t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
+                    if (t<0) t=0;//t always < width-2
+                    t=qp_store[qy+(t>>qps)];
+                    t=norm_qscale(t, p->mpeg2);
+                    if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
+                    column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
+                }
+            row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
+            memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(int16_t)); //cycling
+            memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(int16_t)); //carry the first 6 rows of the last block over to the next run
+        }
+        //remaining columns of this row (the part that is shorter than a full run)
+        es=width+8-x0; //  8, ...
+        if (es>8)
+            row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
+        column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
+        row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
+        {const int y1=y-8+step;//l5-7  l4-6
+            if (!(y1&7) && y1) { //y1 is a non-zero multiple of 8: write the 8 rows that end at y1 to dst
+                if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
+                                        dst_stride, stride, width, 8, 5-p->log2_count);
+                else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
+                                    dst_stride, stride, width, 8, 5-p->log2_count);
+            } }
+    }
+
+    if (y&7) {  // == height & 7
+        if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
+                               dst_stride, stride, width, y&7, 5-p->log2_count);
+        else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
+                            dst_stride, stride, width, y&7, 5-p->log2_count);
+    }
+}
+
+//(Re)allocates the work buffers for the given frame size and configures the next filter; returns 0 if an allocation fails
+static int config(struct vf_instance *vf,
+                  int width, int height, int d_width, int d_height,
+                  unsigned int flags, unsigned int outfmt)
+{
+    struct vf_priv_s *p= vf->priv;
+    const int h= (height+16+15)&(~15); //filter() keeps 8 border rows above and below the plane
+    av_free(p->temp); av_free(p->src); //no-ops the first time (priv starts zeroed); avoids a leak if config runs again
+    p->temp_stride= (width+16+15)&(~15);
+    p->temp= av_mallocz(p->temp_stride*3*8*sizeof(int16_t));
+    p->src = av_malloc(p->temp_stride*h*sizeof(uint8_t)); //this copy buffer could be avoided, see the note in filter()
+    return (p->temp && p->src) ? ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt) : 0;
+}
+
+//Direct rendering hook: unless the image must be preserved, points mpi at the planes of an image obtained from the next filter
+static void get_image(struct vf_instance *vf, mp_image_t *mpi)
+{
+    mp_image_t *out;
+    int plane, nb_planes;
+
+    if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
+    // ok, we can do pp in-place (or pp disabled):
+    out=vf->dmpi=ff_vf_get_image(vf->next, mpi->imgfmt, mpi->type, mpi->flags, mpi->width, mpi->height);
+    nb_planes= (mpi->flags&MP_IMGFLAG_PLANAR) ? 3 : 1; //planes 1 and 2 are only taken over for planar formats
+    for(plane=0; plane<nb_planes; plane++){
+        mpi->planes[plane]=out->planes[plane];
+        mpi->stride[plane]=out->stride[plane];
+    }
+    mpi->width=out->width;
+    mpi->flags|=MP_IMGFLAG_DIRECT; //put_image then takes vf->dmpi as its output image
+}
+//Processes one frame: picks the output image, saves the qscale table of frames with pict_type != 3, filters (or copies) the three planes and passes the result to the next filter
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
+{
+    mp_image_t *dmpi;
+    if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
+        // no DR, so get a new image! hope we'll get DR buffer:
+        dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
+                          MP_IMGTYPE_TEMP,
+                          MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
+                          mpi->width,mpi->height);
+        ff_vf_clone_mpi_attributes(dmpi, mpi);
+    }else{
+        dmpi=vf->dmpi; //image set up by get_image
+    }
+
+    vf->priv->mpeg2= mpi->qscale_type;
+    if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
+        int w = mpi->qstride;
+        int h = (mpi->h + 15) >> 4;
+        if (!w) {
+            w = (mpi->w + 15) >> 4;
+            h = 1;
+        }
+        if(!vf->priv->non_b_qp) //NOTE(review): sized on first use only, never regrown if w*h changes later
+            vf->priv->non_b_qp= malloc(w*h);
+        if(vf->priv->non_b_qp) fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h); //skipped when malloc failed; mpi->qscale is used below instead
+    }
+    if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
+        char *qp_tab= vf->priv->non_b_qp;
+        if(vf->priv->bframes || !qp_tab)
+            qp_tab= mpi->qscale;
+
+        if(qp_tab || vf->priv->qp){ //a quantizer source exists: filter luma, then both chroma planes
+            filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
+                   mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
+            filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
+                   mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
+            filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
+                   mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
+        }else{ //no quantizer information at all: plain copy
+            memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
+            memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
+            memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
+        }
+    }
+
+#if HAVE_MMX
+    if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
+#endif
+#if HAVE_MMX2
+    if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
+#endif
+    return ff_vf_next_put_image(vf,dmpi, pts);
+}
+
+//Frees everything the instance owns: both work buffers, the saved quantizer table and finally the private struct
+static void uninit(struct vf_instance *vf)
+{
+    struct vf_priv_s *p= vf->priv;
+
+    if(p){
+        av_free(p->temp); //allocated with av_mallocz in config
+        av_free(p->src); //allocated with av_malloc in config
+        free(p->non_b_qp); //plain malloc in put_image, hence plain free
+        p->temp= NULL;
+        p->src= NULL;
+        p->non_b_qp= NULL;
+        av_free(p);
+        vf->priv= NULL;
+    }
+}
+
+//===========================================================================//
+
+//Accepts only the formats listed below; for those the answer of the next filter is returned, for anything else 0
+static int query_format(struct vf_instance *vf, unsigned int fmt)
+{
+    static const unsigned int accepted[]= {
+        IMGFMT_YVU9, IMGFMT_IF09,
+        IMGFMT_YV12, IMGFMT_I420, IMGFMT_IYUV,
+        IMGFMT_CLPL,
+        IMGFMT_Y800, IMGFMT_Y8,
+        IMGFMT_444P, IMGFMT_422P, IMGFMT_411P,
+    };
+    unsigned int k;
+
+    for(k=0; k<sizeof(accepted)/sizeof(accepted[0]); k++){
+        if(accepted[k]==fmt)
+            return ff_vf_next_query_format(vf,fmt);
+    }
+    return 0;
+}
+
+//Post-processing level control: reports a maximum level of 5 and maps a requested level onto log2_count; every other request goes to the next filter
+static int control(struct vf_instance *vf, int request, void* data)
+{
+    switch(request){
+    case VFCTRL_QUERY_MAX_PP_LEVEL:
+        return 5;
+    case VFCTRL_SET_PP_LEVEL: //only 4 and 5 are usable: filter() steps by 6-log2_count rows and shifts by 5-log2_count
+        vf->priv->log2_count= (int)*((unsigned int*)data) >= 5 ? 5 : 4; //clamp to 4..5, the same range vf_open enforces
+        return CONTROL_TRUE;
+    }
+    return ff_vf_next_control(vf,request,data);
+}
+
+//Filter entry point. args holds up to four colon-separated integers: quality level (only 4 or 5 are kept), forced quantizer
+//(0 = use the qscale table of each frame), an offset added to the threshold bias (clamped to -15..32) and the bframes flag.
+//Returns 1 on success; 0 on allocation failure (NOTE(review): 0 assumed to be the failure value callers expect).
+static int vf_open(vf_instance_t *vf, char *args)
+{
+    int i=0, bias;
+    int custom_threshold_m[64];
+    int log2c=-1;
+
+    vf->config=config;
+    vf->put_image=put_image;
+    vf->get_image=get_image;
+    vf->query_format=query_format;
+    vf->uninit=uninit;
+    vf->control= control;
+    vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
+    if(!vf->priv) return 0; //allocation failed; uninit copes with a NULL priv
+
+    ff_init_avcodec();
+
+    vf->priv->log2_count= 4;
+    vf->priv->bframes = 0;
+
+    if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
+
+    if( log2c >=4 && log2c <=5 )
+        vf->priv->log2_count = log2c;
+    else if( log2c >= 6 )
+        vf->priv->log2_count = 5;
+
+    if(vf->priv->qp < 0)
+        vf->priv->qp = 0;
+
+    if (i < -15) i = -15;
+    if (i > 32) i = 32;
+
+    bias= (1<<4)+i; //regulable
+    vf->priv->prev_q=0;
+    for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
+        custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
+    for(i=0;i<8;i++){ //pack each row of 8 thresholds into two quadwords, in the order 2,6,0,4 / 5,3,1,7
+        vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
+            |(((uint64_t)custom_threshold_m[i*8+6])<<16)
+            |(((uint64_t)custom_threshold_m[i*8+0])<<32)
+            |(((uint64_t)custom_threshold_m[i*8+4])<<48);
+        vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
+            |(((uint64_t)custom_threshold_m[i*8+3])<<16)
+            |(((uint64_t)custom_threshold_m[i*8+1])<<32)
+            |(((uint64_t)custom_threshold_m[i*8+7])<<48);
+    }
+
+    if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp); //forced quantizer: scale the thresholds once, up front
+
+    return 1;
+}
+
+const vf_info_t ff_vf_info_fspp = { /* descriptor of the fspp filter; field roles below are inferred from the values, confirm against vf_info_t */
+    "fast simple postprocess", /* presumably the human-readable description */
+    "fspp", /* presumably the filter name */
+    "Michael Niedermayer, Nikolaj Poroshin", /* presumably the authors */
+    "", /* empty string; its purpose is not visible from here */
+    vf_open, /* the open function defined above */
+    NULL
+};
+
+//====================================================================
+//Specific spp's dct, idct and threshold functions
+//I'd prefer to have them in the separate file.
+
+//#define MANGLE(a) #a
+
+//typedef int16_t int16_t; //! only int16_t
+
+#define DCTSIZE 8 //transform size; also the row stride of the int16 block workspaces
+#define DCTSIZE_S "8" //DCTSIZE as a string, for pasting into inline asm
+//FIX: x as fixed point with s fraction bits, reduced to 16 bits. C64: a 16-bit value replicated into all four words of a uint64_t (widened before shifting, so no signed overflow).
+#define FIX(x,s)  (((int) ((x) * (1<<(s)) + 0.5))&0xffff)
+#define C64(x)    ((uint64_t)(x)<<48 | (uint64_t)(x)<<32 | (uint64_t)(x)<<16 | (uint64_t)(x))
+#define FIX64(x,s)  C64(FIX(x,s))
+//MULTIPLY16H: upper part of the product x*k (>>16). THRESHOLD: r=x when x lies outside [-t,t], otherwise r=0. DESCALE: right shift by n with rounding.
+#define MULTIPLY16H(x,k)   (((x)*(k))>>16)
+#define THRESHOLD(r,x,t) if(((unsigned)((x)+(t)))>(t)*2) r=(x);else r=0;
+#define DESCALE(x,n)  (((x) + (1 << ((n)-1))) >> (n))
+
+#if HAVE_MMX
+//Fixed-point multipliers for the MMX code: FIX64 puts the same 16-bit value into all four words of a 64-bit constant
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
+DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_541196100)=FIX64(0.541196100, 14);
+DECLARE_ALIGNED(8, uint64_t, ff_MM_FIX_0_707106781)=FIX64(0.707106781, 14);
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
+
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14); //same value as MM_FIX_1_414213562, with 14 instead of 13 fraction bits
+
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //- (the constant is stored negated)
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
+//for t3,t5,t7 == 0 shortcut
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
+DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
+
+DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4); //the value 4 in each of the four words
+DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2); //the value 2 in each of the four words
+
+#else /* !HAVE_MMX */
+//Scalar versions of the multipliers for the C code, with 14 or 13 fraction bits as given in each FIX() call
+typedef int32_t int_simd16_t;
+static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
+static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
+static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
+static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
+static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
+static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
+static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //- (the constant is stored negated)
+static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
+static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
+
+#endif
+
+#if !HAVE_MMX
+//C column pass: per start position and column, runs a forward DCT down the 8 samples, zeroes coefficients within their threshold, inverts the DCT and accumulates into output
+static void column_fidct_c(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
+    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
+
+    int16_t* dataptr;
+    int16_t* wsptr;
+    int16_t *threshold;
+    int ctr;
+
+    dataptr = data;
+    wsptr = output;
+
+    for (; cnt > 0; cnt-=2) { //start positions
+        threshold=(int16_t*)thr_adr;//threshold_mtx
+        for (ctr = DCTSIZE; ctr > 0; ctr--) { //the 8 columns of this start position
+            // Process columns from input, add to output.
+            tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
+            tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
+
+            tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
+            tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
+
+            tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
+            tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
+
+            tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
+            tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+
+            // Even part of FDCT
+
+            tmp10 = tmp0 + tmp3;
+            tmp13 = tmp0 - tmp3;
+            tmp11 = tmp1 + tmp2;
+            tmp12 = tmp1 - tmp2;
+
+            d0 = tmp10 + tmp11;
+            d4 = tmp10 - tmp11;
+
+            z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
+            d2 = tmp13 + z1;
+            d6 = tmp13 - z1;
+
+            // Even part of IDCT
+
+            THRESHOLD(tmp0, d0, threshold[0*8]); //coefficients within their threshold become 0
+            THRESHOLD(tmp1, d2, threshold[2*8]);
+            THRESHOLD(tmp2, d4, threshold[4*8]);
+            THRESHOLD(tmp3, d6, threshold[6*8]);
+            tmp0+=2;
+            tmp10 = (tmp0 + tmp2)>>2;
+            tmp11 = (tmp0 - tmp2)>>2;
+
+            tmp13 = (tmp1 + tmp3)>>2; //+2 !  (psnr decides)
+            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
+
+            tmp0 = tmp10 + tmp13; //->temps
+            tmp3 = tmp10 - tmp13; //->temps
+            tmp1 = tmp11 + tmp12; //->temps
+            tmp2 = tmp11 - tmp12; //->temps
+
+            // Odd part of FDCT
+
+            tmp10 = tmp4 + tmp5;
+            tmp11 = tmp5 + tmp6;
+            tmp12 = tmp6 + tmp7;
+
+            z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
+            z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
+            z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
+            z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
+
+            z11 = tmp7 + z3;
+            z13 = tmp7 - z3;
+
+            d5 = z13 + z2;
+            d3 = z13 - z2;
+            d1 = z11 + z4;
+            d7 = z11 - z4;
+
+            // Odd part of IDCT
+
+            THRESHOLD(tmp4, d1, threshold[1*8]);
+            THRESHOLD(tmp5, d3, threshold[3*8]);
+            THRESHOLD(tmp6, d5, threshold[5*8]);
+            THRESHOLD(tmp7, d7, threshold[7*8]);
+
+            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
+            z13 = tmp6 + tmp5;
+            z10 = (tmp6 - tmp5)<<1;
+            z11 = tmp4 + tmp7;
+            z12 = (tmp4 - tmp7)<<1;
+
+            tmp7 = (z11 + z13)>>2; //+2 !
+            tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
+            z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
+            tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+            tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
+
+            tmp6 = tmp12 - tmp7;
+            tmp5 = tmp11 - tmp6;
+            tmp4 = tmp10 + tmp5;
+
+            wsptr[DCTSIZE*0]+=  (tmp0 + tmp7); //rows 0..5 are added to what is already in output
+            wsptr[DCTSIZE*1]+=  (tmp1 + tmp6);
+            wsptr[DCTSIZE*2]+=  (tmp2 + tmp5);
+            wsptr[DCTSIZE*3]+=  (tmp3 - tmp4);
+            wsptr[DCTSIZE*4]+=  (tmp3 + tmp4);
+            wsptr[DCTSIZE*5]+=  (tmp2 - tmp5);
+            wsptr[DCTSIZE*6]=  (tmp1 - tmp6); //rows 6 and 7 are overwritten, not added (each start position is 2 rows further on)
+            wsptr[DCTSIZE*7]=  (tmp0 - tmp7);
+            //
+            dataptr++; //next column
+            wsptr++;
+            threshold++;
+        }
+        dataptr+=8; //skip each second start pos
+        wsptr  +=8;
+    }
+}
+
+#else /* HAVE_MMX */
+
+/*
+ * MMX column pass. For each group of 4 columns (one 16-bit lane each) it runs
+ * a forward DCT down the 8 rows of data, thresholds every coefficient against
+ * the matching row of thr_adr, runs the inverse DCT and adds the result to
+ * output; rows 6 and 7 of output are overwritten, not accumulated.
+ * One loop iteration does two 4-column groups (part 1, then part 2) and
+ * subtracts 2 from cnt, so cnt has to be a positive even number.
+ * REG_S = data, REG_D = output, REG_c = cnt, REG_d = thr_adr, REG_a = scratch;
+ * %3 = temps, where 't0, 't1, 't2, 't3 are parked at 0*8, 1*8, 2*8, 3*8.
+ */
+static void column_fidct_mmx(int16_t* thr_adr,  int16_t *data,  int16_t *output,  int cnt)
+{
+    DECLARE_ALIGNED(8, uint64_t, temps)[4]; // spill area for 't0..'t3 (operand %3)
+    __asm__ volatile(
+        ASMALIGN(4)
+        "1:                   \n\t"
+        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
+        // part 1, even part of the FDCT: t0..t3, t10..t13 -> d0, d4, d2, d6
+        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
+        "movq %%mm1, %%mm0             \n\t"
+
+        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
+        "movq %%mm7, %%mm3             \n\t"
+
+        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
+        "movq %%mm1, %%mm5             \n\t"
+
+        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
+        "psubw %%mm7, %%mm1            \n\t" //t13
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
+        "movq %%mm6, %%mm4             \n\t"
+
+        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
+        "paddw %%mm7, %%mm5            \n\t" //t10
+
+        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
+        "movq %%mm6, %%mm7             \n\t"
+
+        "paddw %%mm2, %%mm6            \n\t" //t11
+        "psubw %%mm2, %%mm7            \n\t" //t12
+
+        "movq %%mm5, %%mm2             \n\t"
+        "paddw %%mm6, %%mm5            \n\t" //d0
+        // i0 t13 t12 i3 i1 d0 - d4
+        "psubw %%mm6, %%mm2            \n\t" //d4
+        "paddw %%mm1, %%mm7            \n\t"
+
+        "movq  4*16(%%"REG_d"), %%mm6      \n\t" //threshold row 4
+        "psllw $2, %%mm7              \n\t"
+
+        "psubw 0*16(%%"REG_d"), %%mm5      \n\t" //threshold d0 (row 0) and d4 (row 4):
+        "psubw %%mm6, %%mm2            \n\t" //psubw, paddusw, paddw, psubusw with the same threshold
+
+        "paddusw 0*16(%%"REG_d"), %%mm5    \n\t"
+        "paddusw %%mm6, %%mm2          \n\t"
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
+        //
+        "paddw 0*16(%%"REG_d"), %%mm5      \n\t"
+        "paddw %%mm6, %%mm2            \n\t"
+
+        "psubusw 0*16(%%"REG_d"), %%mm5    \n\t"
+        "psubusw %%mm6, %%mm2          \n\t"
+
+//This function is entirely compute-bound and runs very fast, so a DC shortcut
+// at this point is not worthwhile because of the BTB miss penalty (checked on a Pentium 3).
+//Typical distribution: non-DC 29%, DC 46%, zero 25%. The case with all coefficients <> 0 is very rare.
+        "paddw "MANGLE(MM_2)", %%mm5            \n\t"
+        "movq %%mm2, %%mm6             \n\t"
+
+        "paddw %%mm5, %%mm2            \n\t"
+        "psubw %%mm6, %%mm5            \n\t"
+
+        "movq %%mm1, %%mm6             \n\t"
+        "paddw %%mm7, %%mm1            \n\t" //d2
+
+        "psubw 2*16(%%"REG_d"), %%mm1      \n\t" //threshold d2 (row 2)
+        "psubw %%mm7, %%mm6            \n\t" //d6
+
+        "movq 6*16(%%"REG_d"), %%mm7       \n\t" //threshold row 6, applied to d6
+        "psraw $2, %%mm5              \n\t"
+
+        "paddusw 2*16(%%"REG_d"), %%mm1    \n\t"
+        "psubw %%mm7, %%mm6            \n\t"
+        // t7 d2 /t11 t4 t6 - d6 /t10
+
+        "paddw 2*16(%%"REG_d"), %%mm1      \n\t"
+        "paddusw %%mm7, %%mm6          \n\t"
+
+        "psubusw 2*16(%%"REG_d"), %%mm1    \n\t"
+        "paddw %%mm7, %%mm6            \n\t"
+
+        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" //t4
+        "psubusw %%mm7, %%mm6          \n\t"
+
+        //movq [edi+"DCTSIZE_S"*2*2], mm1
+        //movq [edi+"DCTSIZE_S"*6*2], mm6
+        "movq %%mm1, %%mm7             \n\t"
+        "psraw $2, %%mm2              \n\t"
+
+        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" //t6
+        "psubw %%mm6, %%mm1            \n\t"
+
+        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" //t7
+        "paddw %%mm7, %%mm6            \n\t" //'t13
+
+        "psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
+        "movq %%mm2, %%mm7             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
+        "paddw %%mm6, %%mm2            \n\t" //'t0
+
+        "movq %%mm2, 0*8+%3            \n\t" //! spill 't0
+        "psubw %%mm6, %%mm7            \n\t" //'t3
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
+        "psubw %%mm6, %%mm1            \n\t" //'t12
+
+        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
+        "movq %%mm5, %%mm6             \n\t"
+
+        "movq %%mm7, 3*8+%3            \n\t" //spill 't3
+        "paddw %%mm2, %%mm3            \n\t" //t10
+
+        "paddw %%mm4, %%mm2            \n\t" //t11
+        "paddw %%mm0, %%mm4            \n\t" //t12
+
+        "movq %%mm3, %%mm7             \n\t"
+        "psubw %%mm4, %%mm3            \n\t"
+
+        "psllw $2, %%mm3              \n\t"
+        "psllw $2, %%mm7              \n\t" //opt for P6
+
+        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
+        "psllw $2, %%mm4              \n\t"
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
+        "psllw $2, %%mm2              \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
+        "paddw %%mm1, %%mm5            \n\t" //'t1
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
+        "psubw %%mm1, %%mm6            \n\t" //'t2
+        // t7 't12 't11 t4 t6 - 't13 't10   ---
+
+        "paddw %%mm3, %%mm7            \n\t" //z2
+
+        "movq %%mm5, 1*8+%3            \n\t" //spill 't1
+        "paddw %%mm3, %%mm4            \n\t" //z4
+
+        "movq 3*16(%%"REG_d"), %%mm3       \n\t" //threshold row 3, applied to d3
+        "movq %%mm0, %%mm1             \n\t"
+
+        "movq %%mm6, 2*8+%3            \n\t" //spill 't2
+        "psubw %%mm2, %%mm1            \n\t" //z13
+
+//===
+        "paddw %%mm2, %%mm0            \n\t" //z11
+        "movq %%mm1, %%mm5             \n\t"
+
+        "movq 5*16(%%"REG_d"), %%mm2       \n\t" //threshold row 5, applied to d5
+        "psubw %%mm7, %%mm1            \n\t" //d3
+
+        "paddw %%mm7, %%mm5            \n\t" //d5
+        "psubw %%mm3, %%mm1            \n\t"
+
+        "movq 1*16(%%"REG_d"), %%mm7       \n\t" //threshold row 1, applied to d1
+        "psubw %%mm2, %%mm5            \n\t"
+
+        "movq %%mm0, %%mm6             \n\t"
+        "paddw %%mm4, %%mm0            \n\t" //d1
+
+        "paddusw %%mm3, %%mm1          \n\t"
+        "psubw %%mm4, %%mm6            \n\t" //d7
+
+        // d1 d3 - - - d5 d7 -
+        "movq 7*16(%%"REG_d"), %%mm4       \n\t" //threshold row 7, applied to d7
+        "psubw %%mm7, %%mm0            \n\t"
+
+        "psubw %%mm4, %%mm6            \n\t"
+        "paddusw %%mm2, %%mm5          \n\t"
+
+        "paddusw %%mm4, %%mm6          \n\t"
+        "paddw %%mm3, %%mm1            \n\t"
+
+        "paddw %%mm2, %%mm5            \n\t"
+        "paddw %%mm4, %%mm6            \n\t"
+
+        "psubusw %%mm3, %%mm1          \n\t"
+        "psubusw %%mm2, %%mm5          \n\t"
+
+        "psubusw %%mm4, %%mm6          \n\t"
+        "movq %%mm1, %%mm4             \n\t"
+
+        "por %%mm5, %%mm4              \n\t" //mm4 = thresholded d3 | d5 | d7
+        "paddusw %%mm7, %%mm0          \n\t"
+
+        "por %%mm6, %%mm4              \n\t"
+        "paddw %%mm7, %%mm0            \n\t"
+
+        "packssdw %%mm4, %%mm4         \n\t"
+        "psubusw %%mm7, %%mm0          \n\t"
+
+        "movd %%mm4, %%"REG_a"             \n\t"
+        "or %%"REG_a", %%"REG_a"              \n\t"
+        "jnz 2f                 \n\t" //some of them survived: full odd IDCT at 2:
+        //movq [edi+"DCTSIZE_S"*3*2], mm1
+        //movq [edi+"DCTSIZE_S"*5*2], mm5
+        //movq [edi+"DCTSIZE_S"*1*2], mm0
+        //movq [edi+"DCTSIZE_S"*7*2], mm6
+        // t4 t5 - - - t6 t7 -
+        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
+//Typical distribution: non-DC 19%, DC 26%, zero 55%. Handling the zero case alone is not worthwhile.
+        "movq 0*8+%3, %%mm4            \n\t"
+        "movq %%mm0, %%mm1             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
+        "movq %%mm1, %%mm2             \n\t"
+
+        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
+        "movq %%mm2, %%mm3             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
+        "paddw %%mm4, %%mm5            \n\t"
+
+        "movq 1*8+%3, %%mm6            \n\t"
+        //paddw mm3, MM_2
+        "psraw $2, %%mm3              \n\t" //tmp7
+
+        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
+        "psubw %%mm3, %%mm4            \n\t"
+
+        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
+        "paddw %%mm3, %%mm5            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" //row 7 is stored, not accumulated
+        "paddw %%mm6, %%mm7            \n\t"
+
+        "movq 2*8+%3, %%mm3            \n\t"
+        "psubw %%mm0, %%mm6            \n\t"
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
+        "paddw %%mm0, %%mm7            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
+        "paddw %%mm3, %%mm4            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" //row 6 is stored, not accumulated
+        "psubw %%mm1, %%mm3            \n\t"
+
+        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
+        "paddw %%mm1, %%mm4            \n\t"
+
+        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
+        "paddw %%mm3, %%mm5            \n\t"
+
+        "movq 3*8+%3, %%mm0            \n\t"
+        "add $8, %%"REG_S"               \n\t" //next 4 columns of data
+
+        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
+        "paddw %%mm0, %%mm6            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
+        "psubw %%mm2, %%mm0            \n\t"
+
+        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
+        "paddw %%mm2, %%mm6            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+        "paddw %%mm0, %%mm7            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+
+        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+        "add $8, %%"REG_D"               \n\t" //next 4 columns of output
+        "jmp 4f                  \n\t"
+
+        "2:                    \n\t"
+        //--- non DC2
+        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
+        //psraw mm5, 2
+        //psraw mm0, 2
+        //psraw mm6, 2
+        "movq %%mm5, %%mm3             \n\t"
+        "psubw %%mm1, %%mm5            \n\t"
+
+        "psllw $1, %%mm5              \n\t" //'z10
+        "paddw %%mm1, %%mm3            \n\t" //'z13
+
+        "movq %%mm0, %%mm2             \n\t"
+        "psubw %%mm6, %%mm0            \n\t"
+
+        "movq %%mm5, %%mm1             \n\t"
+        "psllw $1, %%mm0              \n\t" //'z12
+
+        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
+        "paddw %%mm0, %%mm5            \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
+        "paddw %%mm6, %%mm2            \n\t" //'z11
+
+        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
+        "movq %%mm2, %%mm7             \n\t"
+
+        //---
+        "movq 0*8+%3, %%mm4            \n\t"
+        "psubw %%mm3, %%mm2            \n\t"
+
+        "psllw $1, %%mm2              \n\t"
+        "paddw %%mm3, %%mm7            \n\t" //'t7
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
+        "movq %%mm4, %%mm6             \n\t"
+        //paddw mm7, MM_2
+        "psraw $2, %%mm7              \n\t"
+
+        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
+        "psubw %%mm7, %%mm6            \n\t"
+
+        "movq 1*8+%3, %%mm3            \n\t"
+        "paddw %%mm7, %%mm4            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" //row 7 is stored, not accumulated
+        "paddw %%mm5, %%mm1            \n\t" //'t12
+
+        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
+        "psubw %%mm7, %%mm1            \n\t" //'t6
+
+        "movq 2*8+%3, %%mm7            \n\t"
+        "psubw %%mm5, %%mm0            \n\t" //'t10
+
+        "movq 3*8+%3, %%mm6            \n\t"
+        "movq %%mm3, %%mm5             \n\t"
+
+        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
+        "psubw %%mm1, %%mm5            \n\t"
+
+        "psubw %%mm1, %%mm2            \n\t" //'t5
+        "paddw %%mm1, %%mm3            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" //row 6 is stored, not accumulated
+        "movq %%mm7, %%mm4             \n\t"
+
+        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
+        "psubw %%mm2, %%mm4            \n\t"
+
+        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
+        "paddw %%mm2, %%mm7            \n\t"
+
+        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
+        "paddw %%mm2, %%mm0            \n\t" //'t4
+
+        // 't4 't6 't5 - - - - 't7
+        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
+        "movq %%mm6, %%mm1             \n\t"
+
+        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
+        "psubw %%mm0, %%mm1            \n\t"
+
+        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+        "add $8, %%"REG_S"               \n\t" //next 4 columns of data
+
+        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+
+        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+        "add $8, %%"REG_D"               \n\t" //next 4 columns of output
+
+        "4:                     \n\t"
+//=part 2 (same steps as part 1 for the next 4 columns; thresholds at 1*8+N*16)=
+        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
+        //
+        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
+        "movq %%mm1, %%mm0             \n\t"
+
+        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
+        "movq %%mm7, %%mm3             \n\t"
+
+        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
+        "movq %%mm1, %%mm5             \n\t"
+
+        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
+        "psubw %%mm7, %%mm1            \n\t" //t13
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
+        "movq %%mm6, %%mm4             \n\t"
+
+        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
+        "paddw %%mm7, %%mm5            \n\t" //t10
+
+        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
+        "movq %%mm6, %%mm7             \n\t"
+
+        "paddw %%mm2, %%mm6            \n\t" //t11
+        "psubw %%mm2, %%mm7            \n\t" //t12
+
+        "movq %%mm5, %%mm2             \n\t"
+        "paddw %%mm6, %%mm5            \n\t" //d0
+        // i0 t13 t12 i3 i1 d0 - d4
+        "psubw %%mm6, %%mm2            \n\t" //d4
+        "paddw %%mm1, %%mm7            \n\t"
+
+        "movq  1*8+4*16(%%"REG_d"), %%mm6  \n\t"
+        "psllw $2, %%mm7              \n\t"
+
+        "psubw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
+        "psubw %%mm6, %%mm2            \n\t"
+
+        "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
+        "paddusw %%mm6, %%mm2          \n\t"
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
+        //
+        "paddw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
+        "paddw %%mm6, %%mm2            \n\t"
+
+        "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
+        "psubusw %%mm6, %%mm2          \n\t"
+
+        "paddw "MANGLE(MM_2)", %%mm5            \n\t"
+        "movq %%mm2, %%mm6             \n\t"
+
+        "paddw %%mm5, %%mm2            \n\t"
+        "psubw %%mm6, %%mm5            \n\t"
+
+        "movq %%mm1, %%mm6             \n\t"
+        "paddw %%mm7, %%mm1            \n\t" //d2
+
+        "psubw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
+        "psubw %%mm7, %%mm6            \n\t" //d6
+
+        "movq 1*8+6*16(%%"REG_d"), %%mm7   \n\t"
+        "psraw $2, %%mm5              \n\t"
+
+        "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
+        "psubw %%mm7, %%mm6            \n\t"
+        // t7 d2 /t11 t4 t6 - d6 /t10
+
+        "paddw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
+        "paddusw %%mm7, %%mm6          \n\t"
+
+        "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
+        "paddw %%mm7, %%mm6            \n\t"
+
+        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" //t4
+        "psubusw %%mm7, %%mm6          \n\t"
+
+        "movq %%mm1, %%mm7             \n\t"
+        "psraw $2, %%mm2              \n\t"
+
+        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" //t6
+        "psubw %%mm6, %%mm1            \n\t"
+
+        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" //t7
+        "paddw %%mm7, %%mm6            \n\t" //'t13
+
+        "psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
+        "movq %%mm2, %%mm7             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
+        "paddw %%mm6, %%mm2            \n\t" //'t0
+
+        "movq %%mm2, 0*8+%3            \n\t" //! spill 't0
+        "psubw %%mm6, %%mm7            \n\t" //'t3
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
+        "psubw %%mm6, %%mm1            \n\t" //'t12
+
+        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
+        "movq %%mm5, %%mm6             \n\t"
+
+        "movq %%mm7, 3*8+%3            \n\t" //spill 't3
+        "paddw %%mm2, %%mm3            \n\t" //t10
+
+        "paddw %%mm4, %%mm2            \n\t" //t11
+        "paddw %%mm0, %%mm4            \n\t" //t12
+
+        "movq %%mm3, %%mm7             \n\t"
+        "psubw %%mm4, %%mm3            \n\t"
+
+        "psllw $2, %%mm3              \n\t"
+        "psllw $2, %%mm7              \n\t" //opt for P6
+
+        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
+        "psllw $2, %%mm4              \n\t"
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
+        "psllw $2, %%mm2              \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
+        "paddw %%mm1, %%mm5            \n\t" //'t1
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
+        "psubw %%mm1, %%mm6            \n\t" //'t2
+        // t7 't12 't11 t4 t6 - 't13 't10   ---
+
+        "paddw %%mm3, %%mm7            \n\t" //z2
+
+        "movq %%mm5, 1*8+%3            \n\t" //spill 't1
+        "paddw %%mm3, %%mm4            \n\t" //z4
+
+        "movq 1*8+3*16(%%"REG_d"), %%mm3   \n\t"
+        "movq %%mm0, %%mm1             \n\t"
+
+        "movq %%mm6, 2*8+%3            \n\t" //spill 't2
+        "psubw %%mm2, %%mm1            \n\t" //z13
+
+//===
+        "paddw %%mm2, %%mm0            \n\t" //z11
+        "movq %%mm1, %%mm5             \n\t"
+
+        "movq 1*8+5*16(%%"REG_d"), %%mm2   \n\t"
+        "psubw %%mm7, %%mm1            \n\t" //d3
+
+        "paddw %%mm7, %%mm5            \n\t" //d5
+        "psubw %%mm3, %%mm1            \n\t"
+
+        "movq 1*8+1*16(%%"REG_d"), %%mm7   \n\t"
+        "psubw %%mm2, %%mm5            \n\t"
+
+        "movq %%mm0, %%mm6             \n\t"
+        "paddw %%mm4, %%mm0            \n\t" //d1
+
+        "paddusw %%mm3, %%mm1          \n\t"
+        "psubw %%mm4, %%mm6            \n\t" //d7
+
+        // d1 d3 - - - d5 d7 -
+        "movq 1*8+7*16(%%"REG_d"), %%mm4   \n\t"
+        "psubw %%mm7, %%mm0            \n\t"
+
+        "psubw %%mm4, %%mm6            \n\t"
+        "paddusw %%mm2, %%mm5          \n\t"
+
+        "paddusw %%mm4, %%mm6          \n\t"
+        "paddw %%mm3, %%mm1            \n\t"
+
+        "paddw %%mm2, %%mm5            \n\t"
+        "paddw %%mm4, %%mm6            \n\t"
+
+        "psubusw %%mm3, %%mm1          \n\t"
+        "psubusw %%mm2, %%mm5          \n\t"
+
+        "psubusw %%mm4, %%mm6          \n\t"
+        "movq %%mm1, %%mm4             \n\t"
+
+        "por %%mm5, %%mm4              \n\t" //mm4 = thresholded d3 | d5 | d7
+        "paddusw %%mm7, %%mm0          \n\t"
+
+        "por %%mm6, %%mm4              \n\t"
+        "paddw %%mm7, %%mm0            \n\t"
+
+        "packssdw %%mm4, %%mm4         \n\t"
+        "psubusw %%mm7, %%mm0          \n\t"
+
+        "movd %%mm4, %%"REG_a"             \n\t"
+        "or %%"REG_a", %%"REG_a"              \n\t"
+        "jnz 3f                 \n\t" //some of them survived: full odd IDCT at 3:
+        // t4 t5 - - - t6 t7 -
+        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
+        "movq 0*8+%3, %%mm4            \n\t"
+        "movq %%mm0, %%mm1             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
+        "movq %%mm1, %%mm2             \n\t"
+
+        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
+        "movq %%mm2, %%mm3             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
+        "paddw %%mm4, %%mm5            \n\t"
+
+        "movq 1*8+%3, %%mm6            \n\t"
+        //paddw mm3, MM_2
+        "psraw $2, %%mm3              \n\t" //tmp7
+
+        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
+        "psubw %%mm3, %%mm4            \n\t"
+
+        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
+        "paddw %%mm3, %%mm5            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" //row 7 is stored, not accumulated
+        "paddw %%mm6, %%mm7            \n\t"
+
+        "movq 2*8+%3, %%mm3            \n\t"
+        "psubw %%mm0, %%mm6            \n\t"
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
+        "paddw %%mm0, %%mm7            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
+        "paddw %%mm3, %%mm4            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" //row 6 is stored, not accumulated
+        "psubw %%mm1, %%mm3            \n\t"
+
+        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
+        "paddw %%mm1, %%mm4            \n\t"
+
+        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
+        "paddw %%mm3, %%mm5            \n\t"
+
+        "movq 3*8+%3, %%mm0            \n\t"
+        "add $24, %%"REG_S"              \n\t" //8 + 24 = 32 bytes of data per loop iteration
+
+        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
+        "paddw %%mm0, %%mm6            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
+        "psubw %%mm2, %%mm0            \n\t"
+
+        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
+        "paddw %%mm2, %%mm6            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+        "paddw %%mm0, %%mm7            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+
+        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+        "add $24, %%"REG_D"              \n\t" //8 + 24 = 32 bytes of output per loop iteration
+        "sub $2, %%"REG_c"               \n\t" //two 4-column groups done
+        "jnz 1b                \n\t"
+        "jmp 5f                   \n\t"
+
+        "3:                    \n\t"
+        //--- non DC2
+        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
+        //psraw mm5, 2
+        //psraw mm0, 2
+        //psraw mm6, 2
+        "movq %%mm5, %%mm3             \n\t"
+        "psubw %%mm1, %%mm5            \n\t"
+
+        "psllw $1, %%mm5              \n\t" //'z10
+        "paddw %%mm1, %%mm3            \n\t" //'z13
+
+        "movq %%mm0, %%mm2             \n\t"
+        "psubw %%mm6, %%mm0            \n\t"
+
+        "movq %%mm5, %%mm1             \n\t"
+        "psllw $1, %%mm0              \n\t" //'z12
+
+        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
+        "paddw %%mm0, %%mm5            \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
+        "paddw %%mm6, %%mm2            \n\t" //'z11
+
+        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
+        "movq %%mm2, %%mm7             \n\t"
+
+        //---
+        "movq 0*8+%3, %%mm4            \n\t"
+        "psubw %%mm3, %%mm2            \n\t"
+
+        "psllw $1, %%mm2              \n\t"
+        "paddw %%mm3, %%mm7            \n\t" //'t7
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
+        "movq %%mm4, %%mm6             \n\t"
+        //paddw mm7, MM_2
+        "psraw $2, %%mm7              \n\t"
+
+        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
+        "psubw %%mm7, %%mm6            \n\t"
+
+        "movq 1*8+%3, %%mm3            \n\t"
+        "paddw %%mm7, %%mm4            \n\t"
+
+        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" //row 7 is stored, not accumulated
+        "paddw %%mm5, %%mm1            \n\t" //'t12
+
+        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
+        "psubw %%mm7, %%mm1            \n\t" //'t6
+
+        "movq 2*8+%3, %%mm7            \n\t"
+        "psubw %%mm5, %%mm0            \n\t" //'t10
+
+        "movq 3*8+%3, %%mm6            \n\t"
+        "movq %%mm3, %%mm5             \n\t"
+
+        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
+        "psubw %%mm1, %%mm5            \n\t"
+
+        "psubw %%mm1, %%mm2            \n\t" //'t5
+        "paddw %%mm1, %%mm3            \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" //row 6 is stored, not accumulated
+        "movq %%mm7, %%mm4             \n\t"
+
+        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
+        "psubw %%mm2, %%mm4            \n\t"
+
+        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
+        "paddw %%mm2, %%mm7            \n\t"
+
+        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
+        "paddw %%mm2, %%mm0            \n\t" //'t4
+
+        // 't4 't6 't5 - - - - 't7
+        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
+        "movq %%mm6, %%mm1             \n\t"
+
+        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
+        "psubw %%mm0, %%mm1            \n\t"
+
+        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
+        "paddw %%mm0, %%mm6            \n\t"
+
+        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
+        "add $24, %%"REG_S"              \n\t" //8 + 24 = 32 bytes of data per loop iteration
+
+        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
+
+        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+        "add $24, %%"REG_D"              \n\t" //8 + 24 = 32 bytes of output per loop iteration
+        "sub $2, %%"REG_c"               \n\t" //two 4-column groups done
+        "jnz 1b                \n\t"
+        "5:                      \n\t"
+
+        : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
+        : "d"(thr_adr)
+          NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781,MM_2,MM_FIX_1_414213562_A,MM_FIX_1_414213562,MM_FIX_0_382683433,
+          ff_MM_FIX_0_541196100,MM_FIX_1_306562965,MM_FIX_0_847759065)
+          NAMED_CONSTRAINTS_ADD(MM_FIX_0_566454497,MM_FIX_0_198912367,MM_FIX_2_613125930,MM_FIX_1_847759065,
+          MM_FIX_1_082392200)
+        : "%"REG_a, "memory" // data/thr_adr are read and output is written through pointers only
+        );
+}
+
+#endif // HAVE_MMX
+
+#if !HAVE_MMX
+
+/*
+ * Plain C row pass of the inverse DCT: each step reads wsptr[0..7], builds one
+ * column of 8 samples, passes them through DESCALE(..., 3) and adds them to
+ * output_adr (rows output_stride apart); cnt*4 columns are processed. Scaling
+ * uses *4 / *8, not <<2 / <<3: left-shifting a negative value is undefined in C.
+ */
+static void row_idct_c(int16_t* workspace,
+                       int16_t* output_adr, int output_stride, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z5, z10, z11, z12, z13;
+    int16_t* outptr = output_adr;
+    int16_t* wsptr = workspace;
+
+    for (cnt *= 4; cnt > 0; cnt--) {
+        // Even part
+        //Simd version reads 4x4 block and transposes it
+        tmp10 = ( wsptr[2] +  wsptr[3]);
+        tmp11 = ( wsptr[2] -  wsptr[3]);
+        tmp13 = ( wsptr[0] +  wsptr[1]);
+        tmp12 = ((MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A)) * 4) - tmp13;//scale after the multiply to avoid overflow
+
+        tmp0 = tmp10 + tmp13; //->temps
+        tmp3 = tmp10 - tmp13; //->temps
+        tmp1 = tmp11 + tmp12;
+        tmp2 = tmp11 - tmp12;
+
+        // Odd part
+        //Also transpose, with previous:
+        // ---- ----      ||||
+        // ---- ---- idct ||||
+        // ---- ---- ---> ||||
+        // ---- ----      ||||
+        z13 = wsptr[4] + wsptr[5];
+        z10 = wsptr[4] - wsptr[5];
+        z11 = wsptr[6] + wsptr[7];
+        z12 = wsptr[6] - wsptr[7];
+
+        tmp7 = z11 + z13;
+        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
+        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
+        tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
+        tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
+
+        tmp6 = (tmp12 * 8) - tmp7;
+        tmp5 = (tmp11 * 8) - tmp6;
+        tmp4 = (tmp10 * 8) + tmp5;
+
+        // Final output stage: descale and accumulate one column
+        outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
+        outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
+        outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
+        outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
+        outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
+        outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
+        outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
+        outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
+        outptr++;               // next output column
+        wsptr += DCTSIZE;       // advance pointer to next row
+    }
+}
+
+#else /* HAVE_MMX */
+
+static void row_idct_mmx (int16_t* workspace,
+                          int16_t* output_adr,  int output_stride,  int cnt)
+{
+    DECLARE_ALIGNED(8, uint64_t, temps)[4];
+    __asm__ volatile(
+        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
+        "1:                     \n\t"
+        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
+        //
+
+        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
+        "movq %%mm0, %%mm4             \n\t"
+
+        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
+        "punpcklwd %%mm1, %%mm0        \n\t"
+
+        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
+        "punpckhwd %%mm1, %%mm4        \n\t"
+
+        //transpose 4x4
+        "movq %%mm2, %%mm7             \n\t"
+        "punpcklwd %%mm3, %%mm2        \n\t"
+
+        "movq %%mm0, %%mm6             \n\t"
+        "punpckldq %%mm2, %%mm0        \n\t" //0
+
+        "punpckhdq %%mm2, %%mm6        \n\t" //1
+        "movq %%mm0, %%mm5             \n\t"
+
+        "punpckhwd %%mm3, %%mm7        \n\t"
+        "psubw %%mm6, %%mm0            \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
+        "movq %%mm4, %%mm2             \n\t"
+
+        "punpckldq %%mm7, %%mm4        \n\t" //2
+        "paddw %%mm6, %%mm5            \n\t"
+
+        "punpckhdq %%mm7, %%mm2        \n\t" //3
+        "movq %%mm4, %%mm1             \n\t"
+
+        "psllw $2, %%mm0              \n\t"
+        "paddw %%mm2, %%mm4            \n\t" //t10
+
+        "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
+        "psubw %%mm2, %%mm1            \n\t" //t11
+
+        "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
+        "psubw %%mm5, %%mm0            \n\t"
+
+        "movq %%mm4, %%mm6             \n\t"
+        "paddw %%mm5, %%mm4            \n\t" //t0
+
+        "psubw %%mm5, %%mm6            \n\t" //t3
+        "movq %%mm1, %%mm7             \n\t"
+
+        "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
+        "paddw %%mm0, %%mm1            \n\t" //t1
+
+        "movq %%mm4, 0*8+%3            \n\t" //t0
+        "movq %%mm3, %%mm4             \n\t"
+
+        "movq %%mm6, 1*8+%3            \n\t" //t3
+        "punpcklwd %%mm2, %%mm3        \n\t"
+
+        //transpose 4x4
+        "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
+        "punpckhwd %%mm2, %%mm4        \n\t"
+
+        "movq %%mm5, %%mm2             \n\t"
+        "punpcklwd %%mm6, %%mm5        \n\t"
+
+        "psubw %%mm0, %%mm7            \n\t" //t2
+        "punpckhwd %%mm6, %%mm2        \n\t"
+
+        "movq %%mm3, %%mm0             \n\t"
+        "punpckldq %%mm5, %%mm3        \n\t" //4
+
+        "punpckhdq %%mm5, %%mm0        \n\t" //5
+        "movq %%mm4, %%mm5             \n\t"
+
+        //
+        "movq %%mm3, %%mm6             \n\t"
+        "punpckldq %%mm2, %%mm4        \n\t" //6
+
+        "psubw %%mm0, %%mm3            \n\t" //z10
+        "punpckhdq %%mm2, %%mm5        \n\t" //7
+
+        "paddw %%mm0, %%mm6            \n\t" //z13
+        "movq %%mm4, %%mm2             \n\t"
+
+        "movq %%mm3, %%mm0             \n\t"
+        "psubw %%mm5, %%mm4            \n\t" //z12
+
+        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
+        "paddw %%mm4, %%mm3            \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
+        "paddw %%mm5, %%mm2            \n\t" //z11  >
+
+        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
+        "movq %%mm2, %%mm5             \n\t"
+
+        "psubw %%mm6, %%mm2            \n\t"
+        "paddw %%mm6, %%mm5            \n\t" //t7
+
+        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
+        "paddw %%mm3, %%mm0            \n\t" //t12
+
+        "psllw $3, %%mm0              \n\t"
+        "psubw %%mm3, %%mm4            \n\t" //t10
+
+        "movq 0*8+%3, %%mm6            \n\t"
+        "movq %%mm1, %%mm3             \n\t"
+
+        "psllw $3, %%mm4              \n\t"
+        "psubw %%mm5, %%mm0            \n\t" //t6
+
+        "psllw $3, %%mm2              \n\t"
+        "paddw %%mm0, %%mm1            \n\t" //d1
+
+        "psubw %%mm0, %%mm2            \n\t" //t5
+        "psubw %%mm0, %%mm3            \n\t" //d6
+
+        "paddw %%mm2, %%mm4            \n\t" //t4
+        "movq %%mm7, %%mm0             \n\t"
+
+        "paddw %%mm2, %%mm7            \n\t" //d2
+        "psubw %%mm2, %%mm0            \n\t" //d5
+
+        "movq "MANGLE(MM_DESCALE_RND)", %%mm2   \n\t" //4
+        "psubw %%mm5, %%mm6            \n\t" //d7
+
+        "paddw 0*8+%3, %%mm5           \n\t" //d0
+        "paddw %%mm2, %%mm1            \n\t"
+
+        "paddw %%mm2, %%mm5            \n\t"
+        "psraw $3, %%mm1              \n\t"
+
+        "paddw %%mm2, %%mm7            \n\t"
+        "psraw $3, %%mm5              \n\t"
+
+        "paddw (%%"REG_D"), %%mm5          \n\t"
+        "psraw $3, %%mm7              \n\t"
+
+        "paddw (%%"REG_D",%%"REG_a"), %%mm1    \n\t"
+        "paddw %%mm2, %%mm0            \n\t"
+
+        "paddw (%%"REG_D",%%"REG_a",2), %%mm7   \n\t"
+        "paddw %%mm2, %%mm3            \n\t"
+
+        "movq %%mm5, (%%"REG_D")           \n\t"
+        "paddw %%mm2, %%mm6            \n\t"
+
+        "movq %%mm1, (%%"REG_D",%%"REG_a")     \n\t"
+        "psraw $3, %%mm0              \n\t"
+
+        "movq %%mm7, (%%"REG_D",%%"REG_a",2)    \n\t"
+        "add %%"REG_d", %%"REG_D"             \n\t" //3*ls
+
+        "movq 1*8+%3, %%mm5           \n\t" //t3
+        "psraw $3, %%mm3              \n\t"
+
+        "paddw (%%"REG_D",%%"REG_a",2), %%mm0   \n\t"
+        "psubw %%mm4, %%mm5            \n\t" //d3
+
+        "paddw (%%"REG_D",%%"REG_d"), %%mm3    \n\t"
+        "psraw $3, %%mm6              \n\t"
+
+        "paddw 1*8+%3, %%mm4           \n\t" //d4
+        "paddw %%mm2, %%mm5            \n\t"
+
+        "paddw (%%"REG_D",%%"REG_a",4), %%mm6   \n\t"
+        "paddw %%mm2, %%mm4            \n\t"
+
+        "movq %%mm0, (%%"REG_D",%%"REG_a",2)    \n\t"
+        "psraw $3, %%mm5              \n\t"
+
+        "paddw (%%"REG_D"), %%mm5          \n\t"
+        "psraw $3, %%mm4              \n\t"
+
+        "paddw (%%"REG_D",%%"REG_a"), %%mm4    \n\t"
+        "add $"DCTSIZE_S"*2*4, %%"REG_S"      \n\t" //4 rows
+
+        "movq %%mm3, (%%"REG_D",%%"REG_d")     \n\t"
+        "movq %%mm6, (%%"REG_D",%%"REG_a",4)    \n\t"
+        "movq %%mm5, (%%"REG_D")           \n\t"
+        "movq %%mm4, (%%"REG_D",%%"REG_a")     \n\t"
+
+        "sub %%"REG_d", %%"REG_D"             \n\t"
+        "add $8, %%"REG_D"               \n\t"
+        "dec %%"REG_c"                   \n\t"
+        "jnz 1b                  \n\t"
+
+        : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
+        : "a"(output_stride*sizeof(short))
+        NAMED_CONSTRAINTS_ADD(MM_FIX_1_414213562_A,MM_FIX_2_613125930,MM_FIX_1_847759065,MM_FIX_1_082392200,
+        MM_FIX_1_414213562,MM_DESCALE_RND)
+        : "%"REG_d
+        );
+}
+
+#endif // HAVE_MMX
+
+#if !HAVE_MMX
+// C fallback (built when !HAVE_MMX): forward-DCT butterflies over 4*cnt pixel columns, reading 8 samples spaced line_size apart per column.
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
+{
+    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+    int_simd16_t tmp10, tmp11, tmp12, tmp13;
+    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
+    int16_t *dataptr;
+    // The loop below handles one column per pass, so the count is scaled by four first.
+    cnt*=4;
+    // Pass 1: process rows.
+    // Each pass reads pixels[0..7*line_size] and writes dataptr[0..7].
+    dataptr = data;
+    for (; cnt > 0; cnt--) {
+        tmp0 = pixels[line_size*0] + pixels[line_size*7];
+        tmp7 = pixels[line_size*0] - pixels[line_size*7];
+        tmp1 = pixels[line_size*1] + pixels[line_size*6];
+        tmp6 = pixels[line_size*1] - pixels[line_size*6];
+        tmp2 = pixels[line_size*2] + pixels[line_size*5];
+        tmp5 = pixels[line_size*2] - pixels[line_size*5];
+        tmp3 = pixels[line_size*3] + pixels[line_size*4];
+        tmp4 = pixels[line_size*3] - pixels[line_size*4];
+        // tmp0..tmp3 now hold the mirrored sums, tmp4..tmp7 the mirrored differences.
+        // Even part
+        // (uses the sums only)
+        tmp10 = tmp0 + tmp3;
+        tmp13 = tmp0 - tmp3;
+        tmp11 = tmp1 + tmp2;
+        tmp12 = tmp1 - tmp2;
+        // The even-part results are stored in a permuted order (slots 2,3 first, then 0,1).
+        // column_fidct() therefore sees a different column order, which is harmless because
+        // columns are processed independently; row_idct() later reads them back in the same order.
+        dataptr[2] = tmp10 + tmp11;
+        dataptr[3] = tmp10 - tmp11;
+        // NOTE(review): MULTIPLY16H is defined outside this excerpt; the <<2 presumably pre-scales for it - confirm.
+        z1 = MULTIPLY16H((tmp12 + tmp13)<<2, FIX_0_707106781);
+        dataptr[0] = tmp13 + z1;
+        dataptr[1] = tmp13 - z1;
+        // tmp10..tmp12 are reused below for the odd part.
+        // Odd part
+        // (uses the differences only)
+        tmp10 = (tmp4 + tmp5) <<2;
+        tmp11 = (tmp5 + tmp6) <<2;
+        tmp12 = (tmp6 + tmp7) <<2;
+        // z5 is shared by z2 and z4.
+        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
+        z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
+        z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
+        z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
+        // z11/z13 combine the unscaled tmp7 with z3.
+        z11 = tmp7 + z3;
+        z13 = tmp7 - z3;
+        // Odd-part results occupy slots 4..7.
+        dataptr[4] = z13 + z2;
+        dataptr[5] = z13 - z2;
+        dataptr[6] = z11 + z4;
+        dataptr[7] = z11 - z4;
+        // Step to the next input column and the next output row.
+        pixels++;               // advance pointer to next column
+        dataptr += DCTSIZE;
+    }
+}
+
+#else /* HAVE_MMX */
+// MMX variant of row_fdct: each loop pass loads 4 adjacent pixels from 8 rows and stores two transposed 4x4 blocks of coefficients; cnt passes in total.
+static void row_fdct_mmx(int16_t *data,  const uint8_t *pixels,  int line_size,  int cnt)
+{
+    DECLARE_ALIGNED(8, uint64_t, temps)[4]; // scratch; only temps[0] (%3) and temps[1] (%4) are referenced by the asm
+    __asm__ volatile(
+        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t" // REG_d = 3*line_size
+        "6:                     \n\t" // loop head, one pass per group of 4 columns
+        "movd (%%"REG_S"), %%mm0           \n\t"
+        "pxor %%mm7, %%mm7             \n\t"
+
+        "movd (%%"REG_S",%%"REG_a"), %%mm1     \n\t"
+        "punpcklbw %%mm7, %%mm0        \n\t"
+
+        "movd (%%"REG_S",%%"REG_a",2), %%mm2    \n\t"
+        "punpcklbw %%mm7, %%mm1        \n\t"
+
+        "punpcklbw %%mm7, %%mm2        \n\t"
+        "add %%"REG_d", %%"REG_S"             \n\t"
+
+        "movq %%mm0, %%mm5             \n\t"
+        //
+
+        "movd (%%"REG_S",%%"REG_a",4), %%mm3    \n\t" //7  ;prefetch!
+        "movq %%mm1, %%mm6             \n\t"
+
+        "movd (%%"REG_S",%%"REG_d"), %%mm4     \n\t" //6
+        "punpcklbw %%mm7, %%mm3        \n\t"
+
+        "psubw %%mm3, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm4        \n\t"
+
+        "paddw %%mm3, %%mm0            \n\t"
+        "psubw %%mm4, %%mm6            \n\t"
+
+        "movd (%%"REG_S",%%"REG_a",2), %%mm3    \n\t" //5
+        "paddw %%mm4, %%mm1            \n\t"
+
+        "movq %%mm5, %3                \n\t" //t7
+        "punpcklbw %%mm7, %%mm3        \n\t"
+
+        "movq %%mm6, %4                \n\t" //t6
+        "movq %%mm2, %%mm4             \n\t"
+
+        "movd (%%"REG_S"), %%mm5           \n\t" //3
+        "paddw %%mm3, %%mm2            \n\t"
+
+        "movd (%%"REG_S",%%"REG_a"), %%mm6     \n\t" //4
+        "punpcklbw %%mm7, %%mm5        \n\t"
+
+        "psubw %%mm3, %%mm4            \n\t"
+        "punpcklbw %%mm7, %%mm6        \n\t"
+
+        "movq %%mm5, %%mm3             \n\t"
+        "paddw %%mm6, %%mm5            \n\t" //t3
+
+        "psubw %%mm6, %%mm3            \n\t" //t4  ; t0 t1 t2 t4 t5 t3 - -
+        "movq %%mm0, %%mm6             \n\t"
+
+        "movq %%mm1, %%mm7             \n\t"
+        "psubw %%mm5, %%mm0            \n\t" //t13
+
+        "psubw %%mm2, %%mm1            \n\t"
+        "paddw %%mm2, %%mm7            \n\t" //t11
+
+        "paddw %%mm0, %%mm1            \n\t"
+        "movq %%mm7, %%mm2             \n\t"
+
+        "psllw $2, %%mm1              \n\t"
+        "paddw %%mm5, %%mm6            \n\t" //t10
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
+        "paddw %%mm6, %%mm7            \n\t" //d2
+
+        "psubw %%mm2, %%mm6            \n\t" //d3
+        "movq %%mm0, %%mm5             \n\t"
+
+        //transpose 4x4
+        "movq %%mm7, %%mm2             \n\t"
+        "punpcklwd %%mm6, %%mm7        \n\t"
+
+        "paddw %%mm1, %%mm0            \n\t" //d0
+        "punpckhwd %%mm6, %%mm2        \n\t"
+
+        "psubw %%mm1, %%mm5            \n\t" //d1
+        "movq %%mm0, %%mm6             \n\t"
+
+        "movq %4, %%mm1                \n\t"
+        "punpcklwd %%mm5, %%mm0        \n\t"
+
+        "punpckhwd %%mm5, %%mm6        \n\t"
+        "movq %%mm0, %%mm5             \n\t"
+
+        "punpckldq %%mm7, %%mm0        \n\t" //0
+        "paddw %%mm4, %%mm3            \n\t"
+
+        "punpckhdq %%mm7, %%mm5        \n\t" //1
+        "movq %%mm6, %%mm7             \n\t"
+
+        "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
+        "punpckldq %%mm2, %%mm6        \n\t" //2
+
+        "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
+        "punpckhdq %%mm2, %%mm7        \n\t" //3
+
+        "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
+        "paddw %%mm1, %%mm4            \n\t"
+
+        "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
+        "psllw $2, %%mm3              \n\t" //t10
+
+        "movq %3, %%mm2               \n\t"
+        "psllw $2, %%mm4              \n\t" //t11
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
+        "paddw %%mm2, %%mm1            \n\t"
+
+        "psllw $2, %%mm1              \n\t" //t12
+        "movq %%mm3, %%mm0             \n\t"
+
+        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
+        "psubw %%mm1, %%mm3            \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
+        "movq %%mm2, %%mm5             \n\t"
+
+        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
+        "psubw %%mm4, %%mm2            \n\t" //z13
+
+        "paddw %%mm4, %%mm5            \n\t" //z11
+        "movq %%mm2, %%mm6             \n\t"
+
+        "paddw %%mm3, %%mm0            \n\t" //z2
+        "movq %%mm5, %%mm7             \n\t"
+
+        "paddw %%mm0, %%mm2            \n\t" //d4
+        "psubw %%mm0, %%mm6            \n\t" //d5
+
+        "movq %%mm2, %%mm4             \n\t"
+        "paddw %%mm3, %%mm1            \n\t" //z4
+
+        //transpose 4x4
+        "punpcklwd %%mm6, %%mm2        \n\t"
+        "paddw %%mm1, %%mm5            \n\t" //d6
+
+        "punpckhwd %%mm6, %%mm4        \n\t"
+        "psubw %%mm1, %%mm7            \n\t" //d7
+
+        "movq %%mm5, %%mm6             \n\t"
+        "punpcklwd %%mm7, %%mm5        \n\t"
+
+        "punpckhwd %%mm7, %%mm6        \n\t"
+        "movq %%mm2, %%mm7             \n\t"
+
+        "punpckldq %%mm5, %%mm2        \n\t" //4
+        "sub %%"REG_d", %%"REG_S"             \n\t"
+
+        "punpckhdq %%mm5, %%mm7        \n\t" //5
+        "movq %%mm4, %%mm5             \n\t"
+
+        "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+        "punpckldq %%mm6, %%mm4        \n\t" //6
+
+        "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+        "punpckhdq %%mm6, %%mm5        \n\t" //7
+
+        "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+        "add $4, %%"REG_S"               \n\t"
+
+        "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
+        "add $"DCTSIZE_S"*2*4, %%"REG_D"      \n\t" //4 rows
+        "dec %%"REG_c"                   \n\t"
+        "jnz 6b                  \n\t"
+
+        : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps), "=o"(temps[1]) // %3 spills t7, %4 spills t6
+        : "a"(line_size) // NOTE(review): an int is used as a full-width index register - confirm the upper half is defined on x86-64
+        NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781,ff_MM_FIX_0_541196100,MM_FIX_0_382683433,MM_FIX_1_306562965)
+        : "%"REG_d); // NOTE(review): memory is accessed through REG_S/REG_D but no "memory" clobber is listed - confirm
+}
+
+#endif // HAVE_MMX
diff --git a/libavfilter/libmpcodecs/vf_ilpack.c b/libavfilter/libmpcodecs/vf_ilpack.c
new file mode 100644
index 0000000000..fbf5817062
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_ilpack.c
@@ -0,0 +1,458 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "config.h"
+#include "mp_msg.h"
+#include "cpudetect.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
+
+typedef void (pack_func_t)(unsigned char *dst, unsigned char *y, // row packer: turns w pixels of y/u/v into interleaved bytes at dst
+    unsigned char *u, unsigned char *v, int w, int us, int vs); // us/vs: chroma offsets read by the interpolating packers, ignored by pack_nn_*
+// State of one ilpack filter instance.
+struct vf_priv_s {
+    int mode; // 0: pack_nn in both slots; any other value ends up with the pack_li pair (see vf_open)
+    pack_func_t *pack[2]; // packers for the middle rows, indexed by the phase bit computed in ilpack()
+};
+/* Nearest-neighbour packer: interleaves w/2 pixel pairs as Y0 U Y1 V; the two stride arguments are not used. */
+static void pack_nn_C(unsigned char *dst, unsigned char *y,
+    unsigned char *u, unsigned char *v, int w,
+    int av_unused us, int av_unused vs)
+{
+    int pairs = w/2;
+    while (pairs) {
+        dst[0] = y[0]; dst[1] = *u++;
+        dst[2] = y[1]; dst[3] = *v++;
+        dst += 4; y += 2;
+        pairs--;
+    }
+}
+/* Interpolating packer, weights 7:1 - chroma out = (7*c[0] + c[2*stride]) >> 3, bytes emitted as Y0 U Y1 V for w/2 pairs. */
+static void pack_li_0_C(unsigned char *dst, unsigned char *y,
+    unsigned char *u, unsigned char *v, int w, int us, int vs)
+{
+    const int u_off = us+us, v_off = vs+vs;
+    int n;
+    for (n = w/2; n != 0; n--, u++, v++, y += 2, dst += 4) {
+        dst[0] = y[0];
+        dst[1] = (7*u[0] + u[u_off])>>3;
+        dst[2] = y[1];
+        dst[3] = (7*v[0] + v[v_off])>>3;
+    }
+}
+/* Interpolating packer, weights 5:3 - chroma out = (5*c[0] + 3*c[2*stride]) >> 3, bytes emitted as Y0 U Y1 V for w/2 pairs. */
+static void pack_li_1_C(unsigned char *dst, unsigned char *y,
+    unsigned char *u, unsigned char *v, int w, int us, int vs)
+{
+    const int u_off = us+us, v_off = vs+vs;
+    int n;
+    for (n = w/2; n != 0; n--, u++, v++, y += 2, dst += 4) {
+        dst[0] = y[0];
+        dst[1] = (5*u[0] + 3*u[u_off])>>3;
+        dst[2] = y[1];
+        dst[3] = (5*v[0] + 3*v[v_off])>>3;
+    }
+}
+
+#if HAVE_MMX
+/* MMX nearest-neighbour packer: 8 pixels (8 Y, 4 U, 4 V -> 16 bytes) per loop pass; the remaining w&7 pixels go through pack_nn_C. */
+static void pack_nn_MMX(unsigned char *dst, unsigned char *y, unsigned char *u,
+    unsigned char *v, int w, int av_unused us, int av_unused vs)
+{
+    int blocks = w/8; /* loop passes; with 0 the decl/jnz pair would wrap around, so the asm is skipped */
+    if (blocks > 0) __asm__ volatile (""
+        ASMALIGN(4)
+        "1: \n\t"
+        "movq (%0), %%mm1 \n\t"
+        "movq (%0), %%mm2 \n\t"
+        "movd (%1), %%mm4 \n\t" /* only four chroma bytes are consumed per pass, so load just those */
+        "movd (%2), %%mm6 \n\t"
+        "punpcklbw %%mm6, %%mm4 \n\t" /* mm4 = U0 V0 U1 V1 U2 V2 U3 V3 */
+        "punpcklbw %%mm4, %%mm1 \n\t" /* mm1 = Y0 U0 Y1 V0 Y2 U1 Y3 V1 */
+        "punpckhbw %%mm4, %%mm2 \n\t" /* mm2 = Y4 U2 Y5 V2 Y6 U3 Y7 V3 */
+        "add $8, %0 \n\t"
+        "add $4, %1 \n\t"
+        "add $4, %2 \n\t"
+        "movq %%mm1, (%3) \n\t"
+        "movq %%mm2, 8(%3) \n\t"
+        "add $16, %3 \n\t"
+        "decl %4 \n\t"
+        "jnz 1b \n\t"
+        "emms \n\t"
+        : "+r" (y), "+r" (u), "+r" (v), "+r" (dst), "+r" (blocks) /* read-write: the asm advances all of them */
+        :
+        : "memory"
+        );
+    pack_nn_C(dst, y, u, v, (w&7), 0, 0); /* tail continues where the asm loop stopped */
+}
+
+#if HAVE_EBX_AVAILABLE
+static void pack_li_0_MMX(unsigned char *dst, unsigned char *y,
+    unsigned char *u, unsigned char *v, int w, int us, int vs)
+{ /* MMX counterpart of pack_li_0_C (weights 7:1): 16 pixels per loop pass, the remaining w&15 pixels are handed to pack_li_0_C. */
+    __asm__ volatile (""
+        "push %%"REG_BP" \n\t" // REG_BP is borrowed to hold vs and restored by the pop at the end
+#if ARCH_X86_64
+        "mov %6, %%"REG_BP" \n\t" // x86-64: operand 6 is vs
+#else
+        "movl 4(%%"REG_d"), %%"REG_BP" \n\t" // x86-32: REG_d holds &us; NOTE(review): assumes vs is stored right behind us - confirm
+        "movl (%%"REG_d"), %%"REG_d" \n\t" // ...then REG_d is replaced by us itself
+#endif
+        "pxor %%mm0, %%mm0 \n\t" // mm0 = 0, used to widen bytes to words
+
+        ASMALIGN(4)
+        "2: \n\t"
+        "movq (%%"REG_S"), %%mm1 \n\t"
+        "movq (%%"REG_S"), %%mm2 \n\t"
+        // low four chroma bytes: mm4/mm6 = u/v at offset 2*us / 2*vs, mm3/mm5 = u/v at offset 0
+        "movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
+        "movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
+        "punpcklbw %%mm0, %%mm4 \n\t"
+        "punpcklbw %%mm0, %%mm6 \n\t"
+        "movq (%%"REG_a"), %%mm3 \n\t"
+        "movq (%%"REG_b"), %%mm5 \n\t"
+        "punpcklbw %%mm0, %%mm3 \n\t"
+        "punpcklbw %%mm0, %%mm5 \n\t"
+        "paddw %%mm3, %%mm4 \n\t" // seven additions in total: mm4 = u[2*us] + 7*u[0]
+        "paddw %%mm5, %%mm6 \n\t" // likewise mm6 = v[2*vs] + 7*v[0]
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "psrlw $3, %%mm4 \n\t" // >>3
+        "psrlw $3, %%mm6 \n\t"
+        "packuswb %%mm4, %%mm4 \n\t"
+        "packuswb %%mm6, %%mm6 \n\t"
+        "punpcklbw %%mm6, %%mm4 \n\t" // interleave U and V
+        "punpcklbw %%mm4, %%mm1 \n\t" // then luma with chroma
+        "punpckhbw %%mm4, %%mm2 \n\t"
+        // first 16 output bytes
+        "movq %%mm1, (%%"REG_D") \n\t"
+        "movq %%mm2, 8(%%"REG_D") \n\t"
+        // second half: next 8 luma bytes and the high four bytes of the same chroma quadwords
+        "movq 8(%%"REG_S"), %%mm1 \n\t"
+        "movq 8(%%"REG_S"), %%mm2 \n\t"
+
+        "movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
+        "movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
+        "punpckhbw %%mm0, %%mm4 \n\t"
+        "punpckhbw %%mm0, %%mm6 \n\t"
+        "movq (%%"REG_a"), %%mm3 \n\t"
+        "movq (%%"REG_b"), %%mm5 \n\t"
+        "punpckhbw %%mm0, %%mm3 \n\t"
+        "punpckhbw %%mm0, %%mm5 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "psrlw $3, %%mm4 \n\t"
+        "psrlw $3, %%mm6 \n\t"
+        "packuswb %%mm4, %%mm4 \n\t"
+        "packuswb %%mm6, %%mm6 \n\t"
+        "punpcklbw %%mm6, %%mm4 \n\t"
+        "punpcklbw %%mm4, %%mm1 \n\t"
+        "punpckhbw %%mm4, %%mm2 \n\t"
+        // per pass: 16 luma bytes and 8 bytes of each chroma plane consumed
+        "add $16, %%"REG_S" \n\t"
+        "add $8, %%"REG_a" \n\t"
+        "add $8, %%"REG_b" \n\t"
+        // second 16 output bytes, 32 per pass
+        "movq %%mm1, 16(%%"REG_D") \n\t"
+        "movq %%mm2, 24(%%"REG_D") \n\t"
+        "add $32, %%"REG_D" \n\t"
+
+        "decl %%ecx \n\t" // ecx starts at w/16; NOTE(review): a start value of 0 would wrap - confirm callers never pass w < 16
+        "jnz 2b \n\t"
+        "emms \n\t"
+        "pop %%"REG_BP" \n\t"
+        : // NOTE(review): no outputs are declared although the asm changes the S, D, a, b, c (and on x86-32 d) input registers - confirm against the GCC extended-asm rules
+        : "S" (y), "D" (dst), "a" (u), "b" (v), "c" (w/16),
+#if ARCH_X86_64
+        "d" ((x86_reg)us), "r" ((x86_reg)vs)
+#else
+        "d" (&us)
+#endif
+        : "memory"
+        );
+    pack_li_0_C(dst, y, u, v, (w&15), us, vs); // tail in C; NOTE(review): at source level dst/y/u/v are not advanced by the asm - confirm the tail lands at the right offset
+}
+/* MMX counterpart of pack_li_1_C (weights 5:3): 16 pixels per loop pass, the remaining w&15 pixels are handed to pack_li_1_C. */
+static void pack_li_1_MMX(unsigned char *dst, unsigned char *y,
+    unsigned char *u, unsigned char *v, int w, int us, int vs)
+{
+    __asm__ volatile (""
+        "push %%"REG_BP" \n\t" // REG_BP is borrowed to hold vs and restored by the pop at the end
+#if ARCH_X86_64
+        "mov %6, %%"REG_BP" \n\t" // x86-64: operand 6 is vs
+#else
+        "movl 4(%%"REG_d"), %%"REG_BP" \n\t" // x86-32: REG_d holds &us; NOTE(review): assumes vs is stored right behind us - confirm
+        "movl (%%"REG_d"), %%"REG_d" \n\t" // ...then REG_d is replaced by us itself
+#endif
+        "pxor %%mm0, %%mm0 \n\t" // mm0 = 0, used to widen bytes to words
+
+        ASMALIGN(4)
+        "3: \n\t"
+        "movq (%%"REG_S"), %%mm1 \n\t"
+        "movq (%%"REG_S"), %%mm2 \n\t"
+        // low four chroma bytes: mm4/mm6 = u/v at offset 2*us / 2*vs, mm3/mm5 = u/v at offset 0
+        "movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
+        "movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
+        "punpcklbw %%mm0, %%mm4 \n\t"
+        "punpcklbw %%mm0, %%mm6 \n\t"
+        "movq (%%"REG_a"), %%mm3 \n\t"
+        "movq (%%"REG_b"), %%mm5 \n\t"
+        "punpcklbw %%mm0, %%mm3 \n\t"
+        "punpcklbw %%mm0, %%mm5 \n\t"
+        "movq %%mm4, %%mm7 \n\t" // mm4 = 3*u[2*us] (copy, double, add the copy)
+        "paddw %%mm4, %%mm4 \n\t"
+        "paddw %%mm7, %%mm4 \n\t"
+        "movq %%mm6, %%mm7 \n\t" // mm6 = 3*v[2*vs]
+        "paddw %%mm6, %%mm6 \n\t"
+        "paddw %%mm7, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t" // five additions in total: + 5*u[0] / + 5*v[0]
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "psrlw $3, %%mm4 \n\t" // >>3
+        "psrlw $3, %%mm6 \n\t"
+        "packuswb %%mm4, %%mm4 \n\t"
+        "packuswb %%mm6, %%mm6 \n\t"
+        "punpcklbw %%mm6, %%mm4 \n\t" // interleave U and V
+        "punpcklbw %%mm4, %%mm1 \n\t" // then luma with chroma
+        "punpckhbw %%mm4, %%mm2 \n\t"
+        // first 16 output bytes
+        "movq %%mm1, (%%"REG_D") \n\t"
+        "movq %%mm2, 8(%%"REG_D") \n\t"
+        // second half: next 8 luma bytes and the high four bytes of the same chroma quadwords
+        "movq 8(%%"REG_S"), %%mm1 \n\t"
+        "movq 8(%%"REG_S"), %%mm2 \n\t"
+
+        "movq (%%"REG_a",%%"REG_d",2), %%mm4 \n\t"
+        "movq (%%"REG_b",%%"REG_BP",2), %%mm6 \n\t"
+        "punpckhbw %%mm0, %%mm4 \n\t"
+        "punpckhbw %%mm0, %%mm6 \n\t"
+        "movq (%%"REG_a"), %%mm3 \n\t"
+        "movq (%%"REG_b"), %%mm5 \n\t"
+        "punpckhbw %%mm0, %%mm3 \n\t"
+        "punpckhbw %%mm0, %%mm5 \n\t"
+        "movq %%mm4, %%mm7 \n\t"
+        "paddw %%mm4, %%mm4 \n\t"
+        "paddw %%mm7, %%mm4 \n\t"
+        "movq %%mm6, %%mm7 \n\t"
+        "paddw %%mm6, %%mm6 \n\t"
+        "paddw %%mm7, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "paddw %%mm3, %%mm4 \n\t"
+        "paddw %%mm5, %%mm6 \n\t"
+        "psrlw $3, %%mm4 \n\t"
+        "psrlw $3, %%mm6 \n\t"
+        "packuswb %%mm4, %%mm4 \n\t"
+        "packuswb %%mm6, %%mm6 \n\t"
+        "punpcklbw %%mm6, %%mm4 \n\t"
+        "punpcklbw %%mm4, %%mm1 \n\t"
+        "punpckhbw %%mm4, %%mm2 \n\t"
+        // per pass: 16 luma bytes and 8 bytes of each chroma plane consumed
+        "add $16, %%"REG_S" \n\t"
+        "add $8, %%"REG_a" \n\t"
+        "add $8, %%"REG_b" \n\t"
+        // second 16 output bytes, 32 per pass
+        "movq %%mm1, 16(%%"REG_D") \n\t"
+        "movq %%mm2, 24(%%"REG_D") \n\t"
+        "add $32, %%"REG_D" \n\t"
+
+        "decl %%ecx \n\t" // ecx starts at w/16; NOTE(review): a start value of 0 would wrap - confirm callers never pass w < 16
+        "jnz 3b \n\t"
+        "emms \n\t"
+        "pop %%"REG_BP" \n\t"
+        : // NOTE(review): no outputs are declared although the asm changes the S, D, a, b, c (and on x86-32 d) input registers - confirm against the GCC extended-asm rules
+        : "S" (y), "D" (dst), "a" (u), "b" (v), "c" (w/16),
+#if ARCH_X86_64
+        "d" ((x86_reg)us), "r" ((x86_reg)vs)
+#else
+        "d" (&us)
+#endif
+        : "memory"
+        );
+    pack_li_1_C(dst, y, u, v, (w&15), us, vs); // tail in C; NOTE(review): at source level dst/y/u/v are not advanced by the asm - confirm the tail lands at the right offset
+}
+#endif /* HAVE_EBX_AVAILABLE */
+#endif
+/* Active packer implementations; file-wide (not per instance) and assigned in vf_open() to the C or MMX variants. */
+static pack_func_t *pack_nn;
+static pack_func_t *pack_li_0;
+static pack_func_t *pack_li_1;
+/* Packs a planar frame into dst row by row: the first two and last two rows go through pack_nn, the rows in between through pack[0]/pack[1]. */
+static void ilpack(unsigned char *dst, unsigned char *src[3],
+    int dststride, int srcstride[3], int w, int h, pack_func_t *pack[2])
+{
+    unsigned char *luma = src[0], *cb = src[1], *cr = src[2];
+    const int ys = srcstride[0], us = srcstride[1], vs = srcstride[2];
+    int row;
+
+    /* Rows 0 and 1: copied with chroma rows 0 and 1 respectively. */
+    pack_nn(dst, luma, cb, cr, w, 0, 0);
+    luma += ys;
+    dst  += dststride;
+    pack_nn(dst, luma, cb + us, cr + vs, w, 0, 0);
+    luma += ys;
+    dst  += dststride;
+
+    for (row = 2; row < h-2; row++) {
+        /* Bit 1 of the row index sets the sign of the strides handed to the packer. */
+        const int dir   = (row & 2) ? 1 : -1;
+        /* Bit 0 xor bit 1 selects which of the two packers runs. */
+        const int phase = (row ^ (row >> 1)) & 1;
+        /* Chroma steps back one row after rows with index 1 mod 4, forward otherwise. */
+        const int step  = ((row & 3) == 1) ? -1 : 1;
+        pack[phase](dst, luma, cb, cr, w, us*dir, vs*dir);
+        luma += ys;
+        cb   += step * us;
+        cr   += step * vs;
+        dst  += dststride;
+    }
+
+    /* Last two rows: plain copy again, moving chroma down one row in between. */
+    pack_nn(dst, luma, cb, cr, w, 0, 0);
+    luma += ys; dst += dststride; cb += us; cr += vs;
+    pack_nn(dst, luma, cb, cr, w, 0, 0);
+}
+
+/* Gets a YUY2 temp image from the next filter, packs the incoming planar frame into it and passes it on with the same pts. */
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
+{
+    mp_image_t *out;
+
+    // hope we'll get DR buffer:
+    out = ff_vf_get_image(vf->next, IMGFMT_YUY2, MP_IMGTYPE_TEMP,
+                          MP_IMGFLAG_ACCEPT_STRIDE, mpi->w, mpi->h);
+
+    ilpack(out->planes[0], mpi->planes, out->stride[0], mpi->stride,
+           mpi->w, mpi->h, vf->priv->pack);
+
+    return ff_vf_next_put_image(vf, out, pts);
+}
+/* Forwards the geometry and flags unchanged but always announces YUY2 downstream; the outfmt argument is not used. */
+static int config(struct vf_instance *vf,
+          int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt)
+{
+    /* FIXME - also support UYVY output? */
+    const unsigned int packed_fmt = IMGFMT_YUY2;
+    return ff_vf_next_config(vf, width, height, d_width, d_height, flags, packed_fmt);
+}
+
+/* For YV12/IYUV/I420 input returns whatever the next filter answers for YUY2; every other format is refused with 0. */
+static int query_format(struct vf_instance *vf, unsigned int fmt)
+{
+    /* FIXME - really any YUV 4:2:0 input format should work */
+    const int planar420 = fmt == IMGFMT_YV12 ||
+                          fmt == IMGFMT_IYUV ||
+                          fmt == IMGFMT_I420;
+
+    if (!planar420)
+        return 0;
+    return ff_vf_next_query_format(vf,IMGFMT_YUY2);
+}
+/* Filter setup: installs the callbacks, parses the optional numeric mode from args, selects C or MMX packers; returns 1 on success, 0 if the private state cannot be allocated. */
+static int vf_open(vf_instance_t *vf, char *args)
+{
+    vf->config=config;
+    vf->query_format=query_format;
+    vf->put_image=put_image;
+    vf->priv = calloc(1, sizeof(struct vf_priv_s));
+    if (!vf->priv) return 0; /* out of memory: report failure instead of dereferencing NULL below */
+    vf->priv->mode = 1;
+    if (args) sscanf(args, "%d", &vf->priv->mode); /* an unparsable argument leaves the default mode 1 */
+
+    pack_nn = pack_nn_C;
+    pack_li_0 = pack_li_0_C;
+    pack_li_1 = pack_li_1_C;
+#if HAVE_MMX
+    if(ff_gCpuCaps.hasMMX) {
+        pack_nn = pack_nn_MMX;
+#if HAVE_EBX_AVAILABLE
+        pack_li_0 = pack_li_0_MMX;
+        pack_li_1 = pack_li_1_MMX;
+#endif
+    }
+#endif
+
+    switch(vf->priv->mode) {
+    case 0:
+        vf->priv->pack[0] = vf->priv->pack[1] = pack_nn;
+        break;
+    default:
+        ff_mp_msg(MSGT_VFILTER, MSGL_WARN,
+            "ilpack: unknown mode %d (fallback to linear)\n",
+            vf->priv->mode);
+        /* Fallthrough */
+    case 1:
+        vf->priv->pack[0] = pack_li_0;
+        vf->priv->pack[1] = pack_li_1;
+        break;
+    }
+    return 1;
+}
+/* Registration record of the ilpack filter; NOTE(review): the meaning of each positional field comes from vf_info_t, declared outside this excerpt - confirm there. */
+const vf_info_t ff_vf_info_ilpack = {
+    "4:2:0 planar -> 4:2:2 packed reinterlacer",
+    "ilpack",
+    "Richard Felker",
+    "",
+    vf_open, // setup function of this filter
+    NULL
+};
diff --git a/libavfilter/libmpcodecs/vf_pp7.c b/libavfilter/libmpcodecs/vf_pp7.c
new file mode 100644
index 0000000000..89ed4fe679
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_pp7.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (C) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <math.h>
+
+#include "config.h"
+
+#include "mp_msg.h"
+#include "cpudetect.h"
+
+#if HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "libavutil/mem.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+#include "libvo/fastmemcpy.h"
+/* Minimum/maximum helpers; being function-like macros they may evaluate an argument twice. */
+#define XMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XMAX(a,b) ((a) > (b) ? (a) : (b))
+/* 8-byte aligned 8x8 byte table containing every value 0..63 exactly once; its consumer lies outside this excerpt. */
+//===========================================================================//
+DECLARE_ALIGNED(8, static const uint8_t, dither)[8][8] = {
+{  0,  48,  12,  60,   3,  51,  15,  63, },
+{ 32,  16,  44,  28,  35,  19,  47,  31, },
+{  8,  56,   4,  52,  11,  59,   7,  55, },
+{ 40,  24,  36,  20,  43,  27,  39,  23, },
+{  2,  50,  14,  62,   1,  49,  13,  61, },
+{ 34,  18,  46,  30,  33,  17,  45,  29, },
+{ 10,  58,   6,  54,   9,  57,   5,  53, },
+{ 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+/* Per-instance state of the pp7 filter; the code using these fields lies outside this excerpt, so the notes below are assumptions to confirm. */
+struct vf_priv_s {
+    int qp; // presumably a fixed quantizer - TODO confirm
+    int mode; // presumably selects the thresholding function - TODO confirm
+    int mpeg2; // presumably flags MPEG-2 style qp values - TODO confirm
+    int temp_stride; // presumably the line stride of src - TODO confirm
+    uint8_t *src; // presumably a working copy of the input plane - TODO confirm
+};
+#if 0 /* disabled code: dct7_c is never compiled (its innermost #else branch refers to a src that is not declared) */
+static inline void dct7_c(int16_t *dst, int s0, int s1, int s2, int s3, int step){
+    int s, d;
+    int dst2[64];
+//#define S0 (1024/0.37796447300922719759)
+#define C0 ((int)(1024*0.37796447300922719759+0.5)) //sqrt(1/7)
+#define C1 ((int)(1024*0.53452248382484879308/6+0.5)) //sqrt(2/7)/6
+
+#define C2 ((int)(1024*0.45221175985034745004/2+0.5))
+#define C3 ((int)(1024*0.36264567479870879474/2+0.5))
+
+//0.1962505182412941918 0.0149276808419397944-0.2111781990832339584
+#define C4 ((int)(1024*0.1962505182412941918+0.5))
+#define C5 ((int)(1024*0.0149276808419397944+0.5))
+//#define C6 ((int)(1024*0.2111781990832339584+0.5))
+#if 0
+    s= s0 + s1 + s2;
+    dst[0*step] = ((s + s3)*C0 + 512) >> 10;
+    s= (s - 6*s3)*C1 + 512;
+    d= (s0-s2)*C4 + (s1-s2)*C5;
+    dst[1*step] = (s + 2*d)>>10;
+    s -= d;
+    d= (s1-s0)*C2 + (s1-s2)*C3;
+    dst[2*step] = (s + d)>>10;
+    dst[3*step] = (s - d)>>10;
+#elif 1
+    s = s3+s3;
+    s3= s-s0;
+    s0= s+s0;
+    s = s2+s1;
+    s2= s2-s1;
+    dst[0*step]= s0 + s;
+    dst[2*step]= s0 - s;
+    dst[1*step]= 2*s3 +   s2;
+    dst[3*step]=   s3 - 2*s2;
+#else
+    int i,j,n=7;
+    for(i=0; i<7; i+=2){
+        dst2[i*step/2]= 0;
+        for(j=0; j<4; j++)
+            dst2[i*step/2] += src[j*step] * cos(i*M_PI/n*(j+0.5)) * sqrt((i?2.0:1.0)/n);
+        if(fabs(dst2[i*step/2] - dst[i*step/2]) > 20)
+            printf("%d %d %d (%d %d %d %d) -> (%d %d %d %d)\n", i,dst2[i*step/2], dst[i*step/2],src[0*step], src[1*step], src[2*step], src[3*step], dst[0*step], dst[1*step],dst[2*step],dst[3*step]);
+    }
+#endif
+}
+#endif
+/* First pass: for four adjacent columns, folds the seven src rows 0..6 around row 3 and stores four coefficients per column, contiguously in dst. */
+static inline void dctA_c(int16_t *dst, uint8_t *src, int stride){
+    int col;
+    for(col=0; col<4; col++){
+        const uint8_t *p = src + col;
+        int16_t *out     = dst + 4*col;
+        /* sums of the taps mirrored around row 3; the centre tap counts twice */
+        const int outer  = p[0*stride] + p[6*stride];
+        const int middle = p[1*stride] + p[5*stride];
+        const int inner  = p[2*stride] + p[4*stride];
+        const int centre = 2*p[3*stride];
+        /* second stage */
+        const int even_hi = centre + outer;
+        const int odd_hi  = centre - outer;
+        const int even_lo = inner + middle;
+        const int odd_lo  = inner - middle;
+        out[0]= even_hi + even_lo;
+        out[1]= 2*odd_hi + odd_lo;
+        out[2]= even_hi - even_lo;
+        out[3]= odd_hi - 2*odd_lo;
+    }
+}
+/* Second pass: the same fold as dctA_c on int16 data laid out 4 values per row; input rows 0..6 of src yield rows 0..3 of dst. */
+static void dctB_c(int16_t *dst, int16_t *src){
+    int col;
+    for(col=0; col<4; col++){
+        const int16_t *in = src + col;
+        int16_t *out      = dst + col;
+        /* sums of the taps mirrored around row 3; the centre tap counts twice */
+        const int outer  = in[0*4] + in[6*4];
+        const int middle = in[1*4] + in[5*4];
+        const int inner  = in[2*4] + in[4*4];
+        const int centre = 2*in[3*4];
+        /* second stage */
+        const int even_hi = centre + outer;
+        const int odd_hi  = centre - outer;
+        const int even_lo = inner + middle;
+        const int odd_lo  = inner - middle;
+        out[0*4]= even_hi + even_lo;
+        out[1*4]= 2*odd_hi + odd_lo;
+        out[2*4]= even_hi - even_lo;
+        out[3*4]= odd_hi - 2*odd_lo;
+    }
+}
+
+#if HAVE_MMX
+static void dctB_mmx(int16_t *dst, int16_t *src){ // MMX form of dctB_c: each quadword carries one row of four int16, so all four columns are processed together
+    __asm__ volatile (
+        "movq  (%0), %%mm0      \n\t" // row 0
+        "movq  1*4*2(%0), %%mm1 \n\t" // row 1
+        "paddw 6*4*2(%0), %%mm0 \n\t" // s0 = row0 + row6
+        "paddw 5*4*2(%0), %%mm1 \n\t" // s1 = row1 + row5
+        "movq  2*4*2(%0), %%mm2 \n\t" // row 2
+        "movq  3*4*2(%0), %%mm3 \n\t" // s3 = row 3
+        "paddw 4*4*2(%0), %%mm2 \n\t" // s2 = row2 + row4
+        "paddw %%mm3, %%mm3     \n\t" //s
+        "movq %%mm3, %%mm4      \n\t" //s
+        "psubw %%mm0, %%mm3     \n\t" //s-s0
+        "paddw %%mm0, %%mm4     \n\t" //s+s0
+        "movq %%mm2, %%mm0      \n\t" //s2
+        "psubw %%mm1, %%mm2     \n\t" //s2-s1
+        "paddw %%mm1, %%mm0     \n\t" //s2+s1
+        "movq %%mm4, %%mm1      \n\t" //s0'
+        "psubw %%mm0, %%mm4     \n\t" //s0'-s'
+        "paddw %%mm0, %%mm1     \n\t" //s0'+s'
+        "movq %%mm3, %%mm0      \n\t" //s3'
+        "psubw %%mm2, %%mm3     \n\t" // two subtractions: mm3 = s3' - 2*s2'
+        "psubw %%mm2, %%mm3     \n\t"
+        "paddw %%mm0, %%mm2     \n\t" // two additions: mm2 = s2' + 2*s3'
+        "paddw %%mm0, %%mm2     \n\t"
+        "movq %%mm1, (%1)       \n\t" // dst row 0
+        "movq %%mm4, 2*4*2(%1)  \n\t" // dst row 2
+        "movq %%mm2, 1*4*2(%1)  \n\t" // dst row 1
+        "movq %%mm3, 3*4*2(%1)  \n\t" // dst row 3
+        :: "r" (src), "r"(dst) // %0 = src, %1 = dst; NOTE(review): stores through %1 without a "memory" clobber and lists no mm clobbers - confirm this is safe
+    );
+}
+#endif
+/* Active dctB implementation, C version by default; presumably switched to dctB_mmx elsewhere in the file - confirm. */
+static void (*dctB)(int16_t *dst, int16_t *src)= dctB_c;
+/* factor[]: N = 1<<16 divided by the product of a row constant and a column constant, each taken from the sequence N0, N1, N0, N2; the *thresh_c() functions use it as per-coefficient weights. */
+#define N0 4
+#define N1 5
+#define N2 10
+#define SN0 2 // sqrt(N0)
+#define SN1 2.2360679775 // approximately sqrt(N1)
+#define SN2 3.16227766017 // approximately sqrt(N2)
+#define N (1<<16)
+
+static const int factor[16]={
+    N/(N0*N0), N/(N0*N1), N/(N0*N0),N/(N0*N2),
+    N/(N1*N0), N/(N1*N1), N/(N1*N0),N/(N1*N2),
+    N/(N0*N0), N/(N0*N1), N/(N0*N0),N/(N0*N2),
+    N/(N2*N0), N/(N2*N1), N/(N2*N0),N/(N2*N2),
+};
+/* Table built from the SN* constants (double expressions converted to int); NOTE(review): no use of thres[] is visible in this excerpt, and it has SN2 where factor[] has N1 - confirm intent. */
+static const int thres[16]={
+    N/(SN0*SN0), N/(SN0*SN2), N/(SN0*SN0),N/(SN0*SN2),
+    N/(SN2*SN0), N/(SN2*SN2), N/(SN2*SN0),N/(SN2*SN2),
+    N/(SN0*SN0), N/(SN0*SN2), N/(SN0*SN0),N/(SN0*SN2),
+    N/(SN2*SN0), N/(SN2*SN2), N/(SN2*SN0),N/(SN2*SN2),
+};
+
+static int thres2[99][16]; // dead-zone thresholds indexed [qp][coefficient]; filled by init_thres2()
+
+static void init_thres2(void){ // precompute thres2[][] for every qp in 0..98
+    const int bias= 0; //FIXME
+    int q, k;
+    for(q=0; q<99; q++)
+        for(k=0; k<16; k++){
+            const double col= (k&1) ? SN2 : SN0; // columns 1 and 3 get the larger weight
+            const double row= (k&4) ? SN2 : SN0; // rows 1 and 3 get the larger weight
+            thres2[q][k]= col * row * XMAX(1,q) * (1<<2) - 1 - bias; // qp 0 is treated like qp 1; result truncates to int
+        }
+}
+
+static int hardthresh_c(int16_t *src, int qp){ // hard threshold: AC coefficients inside the dead zone are dropped, the others are kept unchanged
+    const int *limits= thres2[qp];
+    int acc= src[0] * factor[0]; // coefficient 0 is always kept
+    int k;
+
+    for(k=1; k<16; k++){
+        const unsigned int t= limits[k];
+        const int coef= src[k];
+        // one unsigned compare tests -t <= coef <= t (a negative sum wraps to a huge value)
+        if((unsigned)(coef+t) <= (t<<1))
+            continue;
+        acc += coef * factor[k];
+    }
+    return (acc + (1<<11))>>12; // add half, then scale down by 2^12
+}
+
+static int mediumthresh_c(int16_t *src, int qp){ // medium threshold: dead zone up to t, doubled-slope ramp up to 2*t, identity beyond that
+    const int *limits= thres2[qp];
+    int acc= src[0] * factor[0]; // coefficient 0 is always kept
+    int k;
+
+    for(k=1; k<16; k++){
+        const unsigned int t= limits[k];
+        const int coef= src[k];
+        int kept;
+
+        if((unsigned)(coef+t) <= (t<<1))
+            continue; // -t <= coef <= t: discarded
+        if((unsigned)(coef+2*t) > 2*(t<<1))
+            kept= coef; // outside -2t..2t: passed through unchanged
+        else
+            kept= 2*(coef>0 ? coef - (int)t : coef + (int)t); // in between: twice the distance from the dead zone, sign kept
+        acc += kept * factor[k];
+    }
+    return (acc + (1<<11))>>12; // add half, then scale down by 2^12
+}
+
+static int softthresh_c(int16_t *src, int qp){ // soft threshold: dead zone up to t, surviving coefficients are pulled towards zero by t
+    const int *limits= thres2[qp];
+    int acc= src[0] * factor[0]; // coefficient 0 is always kept
+    int k;
+
+    for(k=1; k<16; k++){
+        const unsigned int t= limits[k];
+        const int coef= src[k];
+
+        if((unsigned)(coef+t) <= (t<<1))
+            continue; // -t <= coef <= t: discarded
+        // shrink the magnitude by t while keeping the sign
+        acc += (coef>0 ? coef - (int)t : coef + (int)t) * factor[k];
+    }
+    return (acc + (1<<11))>>12; // add half, then scale down by 2^12
+}
+
+static int (*requantize)(int16_t *src, int qp)= hardthresh_c; // active requantizer; vf_open() selects it from the "mode" argument
+
+static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){ // filter one plane: mirror-pad it into p->src, then rebuild every output pixel via dctA_c/dctB/requantize
+    int x, y;
+    const int stride= is_luma ? p->temp_stride : ((width+16+15)&(~15)); // chroma recomputes the padded stride from its own width
+    uint8_t  *p_src= p->src + 8*stride; // padded picture area starts 8 rows into the buffer
+    int16_t *block= (int16_t *)p->src; // dctB output (16 coefficients), at the very start of the buffer
+    int16_t *temp= (int16_t *)(p->src + 32); // dctA_c output for the current row, right behind block
+
+    if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
+    for(y=0; y<height; y++){ // copy each row and mirror 8 pixels beyond its left and right edges
+        int index= 8 + 8*stride + y*stride;
+        fast_memcpy(p_src + index, src + y*src_stride, width);
+        for(x=0; x<8; x++){
+            p_src[index         - x - 1]= p_src[index +         x    ];
+            p_src[index + width + x    ]= p_src[index + width - x - 1];
+        }
+    }
+    for(y=0; y<8; y++){ // mirror 8 full (already padded) rows above and below the picture
+        fast_memcpy(p_src + (       7-y)*stride, p_src + (       y+8)*stride, stride);
+        fast_memcpy(p_src + (height+8+y)*stride, p_src + (height-y+7)*stride, stride);
+    }
+    //FIXME (try edge emu)
+
+    for(y=0; y<height; y++){
+        for(x=-8; x<0; x+=4){ // prime temp[] with the two 4-column groups left of x=0
+            const int index= x + y*stride + (8-3)*(1+stride) + 8; //FIXME silly offset
+            uint8_t *src  = p_src + index;
+            int16_t *tp= temp+4*x;
+
+            dctA_c(tp+4*8, src, stride);
+        }
+        for(x=0; x<width; ){ // walk the row in groups of up to 8 pixels sharing one qp
+            const int qps= 3 + is_luma; // shift from pixel position to qp_store index: 4 for luma, 3 for chroma
+            int qp;
+            int end= XMIN(x+8, width);
+
+            if(p->qp)
+                qp= p->qp; // user-forced quantizer
+            else{
+                qp= qp_store[ (XMIN(x, width-1)>>qps) + (XMIN(y, height-1)>>qps) * qp_stride];
+                qp=norm_qscale(qp, p->mpeg2);
+            }
+            for(; x<end; x++){
+                const int index= x + y*stride + (8-3)*(1+stride) + 8; //FIXME silly offset
+                uint8_t *src  = p_src + index;
+                int16_t *tp= temp+4*x;
+                int v;
+
+                if((x&3)==0) // dctA_c is run once per 4 columns
+                    dctA_c(tp+4*8, src, stride);
+
+                dctB(block, tp);
+
+                v= requantize(block, qp);
+                v= (v + dither[y&7][x&7])>>6;
+                if((unsigned)v > 255) // out of 8-bit range: clip
+                    v= (-v)>>31; // 0 for negative v; -1 (stored as 255) for v > 255, given an arithmetic right shift
+                dst[x + y*dst_stride]= v;
+            }
+        }
+    }
+}
+
+// (Re)allocate the padded work buffer for the new frame size; returns 0 if the allocation fails.
+static int config(struct vf_instance *vf,
+    int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt){
+    int h= (height+16+15)&(~15);
+    vf->priv->temp_stride= (width+16+15)&(~15);
+    av_free(vf->priv->src); // drop the buffer of an earlier config() call (av_free(NULL) is a no-op)
+    vf->priv->src = av_malloc(vf->priv->temp_stride*(h+8)*sizeof(uint8_t));
+    if(!vf->priv->src) return 0; // out of memory: filter() must never run on a NULL buffer
+    return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
+}
+
+static void get_image(struct vf_instance *vf, mp_image_t *mpi){ // direct rendering: point mpi at the buffer obtained from the next filter
+    mp_image_t *target;
+    int plane, last_plane;
+
+    if(mpi->flags&MP_IMGFLAG_PRESERVE) // don't change
+        return;
+    target= vf->dmpi= ff_vf_get_image(vf->next, mpi->imgfmt, mpi->type,
+                                      mpi->flags | MP_IMGFLAG_READABLE, mpi->width, mpi->height);
+    last_plane= (mpi->flags&MP_IMGFLAG_PLANAR) ? 2 : 0; // non-planar images: only plane 0 is redirected
+    for(plane=0; plane<=last_plane; plane++){
+        mpi->planes[plane]= target->planes[plane];
+        mpi->stride[plane]= target->stride[plane];
+    }
+    mpi->width= target->width;
+    mpi->flags|= MP_IMGFLAG_DIRECT; // put_image() checks this flag and then uses vf->dmpi
+}
+
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts){ // filter (or plainly copy) one frame into dmpi and hand it to the next filter
+    const int chroma_w= mpi->w>>mpi->chroma_x_shift;
+    const int chroma_h= mpi->h>>mpi->chroma_y_shift;
+    mp_image_t *dmpi= vf->dmpi; // used as is when the frame arrived with MP_IMGFLAG_DIRECT
+
+    if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
+        // no DR, so get a new image! hope we'll get DR buffer:
+        dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
+            MP_IMGTYPE_TEMP,
+            MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
+            mpi->width,mpi->height);
+        ff_vf_clone_mpi_attributes(dmpi, mpi);
+    }
+
+    vf->priv->mpeg2= mpi->qscale_type;
+    if(!mpi->qscale && !vf->priv->qp){ // neither a qscale table nor a forced qp: plain copy
+        memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
+        memcpy_pic(dmpi->planes[1], mpi->planes[1], chroma_w, chroma_h, dmpi->stride[1], mpi->stride[1]);
+        memcpy_pic(dmpi->planes[2], mpi->planes[2], chroma_w, chroma_h, dmpi->stride[2], mpi->stride[2]);
+    }else{
+        filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0], mpi->w, mpi->h, mpi->qscale, mpi->qstride, 1);
+        filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1], chroma_w, chroma_h, mpi->qscale, mpi->qstride, 0);
+        filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2], chroma_w, chroma_h, mpi->qscale, mpi->qstride, 0);
+    }
+
+#if HAVE_MMX
+    if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
+#endif
+#if HAVE_MMX2
+    if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
+#endif
+
+    return ff_vf_next_put_image(vf,dmpi, pts);
+}
+
+static void uninit(struct vf_instance *vf){ // release the work buffer and the private context; safe to call without one
+    struct vf_priv_s *priv= vf->priv;
+    if(priv){
+        av_free(priv->src);
+        priv->src= NULL;
+        free(priv);
+        vf->priv= NULL;
+    }
+}
+
+//===========================================================================//
+static int query_format(struct vf_instance *vf, unsigned int fmt){ // formats listed here are accepted if the next filter accepts them
+    switch(fmt){
+    case IMGFMT_YVU9:
+    case IMGFMT_IF09:
+    case IMGFMT_YV12:
+    case IMGFMT_I420:
+    case IMGFMT_IYUV:
+    case IMGFMT_CLPL:
+    case IMGFMT_Y800: case IMGFMT_Y8:
+    case IMGFMT_444P:
+    case IMGFMT_422P:
+    case IMGFMT_411P:
+        break; // handled by this filter: the answer is up to the next one
+    default: return 0; // everything else is rejected
+    }
+    return ff_vf_next_query_format(vf,fmt);
+}
+
+static int control(struct vf_instance *vf, int request, void* data){ // no request is handled here: everything is forwarded to the next filter
+    return ff_vf_next_control(vf,request,data);
+}
+
+// Filter entry point. args is an optional "qp:mode" string: qp > 0 forces a
+// fixed quantizer (0 = use the qscale table of each frame), mode selects the
+// requantizer (0 = hard, 1 = soft, anything else = medium threshold).
+// Returns 1 on success, 0 if the private context cannot be allocated.
+static int vf_open(vf_instance_t *vf, char *args){
+    vf->config=config;
+    vf->put_image=put_image;
+    vf->get_image=get_image;
+    vf->query_format=query_format;
+    vf->uninit=uninit;
+    vf->control= control;
+    vf->priv=malloc(sizeof(struct vf_priv_s));
+    if(!vf->priv) return 0;
+    memset(vf->priv, 0, sizeof(struct vf_priv_s));
+
+    if (args) sscanf(args, "%d:%d", &vf->priv->qp, &vf->priv->mode);
+
+    // thres2[] only has rows for qp 0..98 and filter() indexes it with this value
+    if(vf->priv->qp < 0)  vf->priv->qp = 0;
+    if(vf->priv->qp > 98) vf->priv->qp = 98;
+
+    init_thres2();
+
+    // pick the requantizer; unknown modes fall back to the medium threshold
+    switch(vf->priv->mode){
+        case 0: requantize= hardthresh_c; break;
+        case 1: requantize= softthresh_c; break;
+        default:
+        case 2: requantize= mediumthresh_c; break;
+    }
+
+    // use the MMX version of dctB when the CPU supports it
+#if HAVE_MMX
+    if(ff_gCpuCaps.hasMMX){
+        dctB= dctB_mmx;
+    }
+#endif
+
+    return 1;
+}
+
+const vf_info_t ff_vf_info_pp7 = { // filter descriptor; NOTE(review): vf_info_t is declared elsewhere, field meanings below are inferred from the values
+    "postprocess 7", // presumably the human-readable description
+    "pp7", // presumably the name used to select the filter
+    "Michael Niedermayer", // author
+    "", // presumably a free-form comment
+    vf_open, // open callback defined above
+    NULL // NOTE(review): last field left unset -- confirm its meaning against vf.h
+};
diff --git a/libavfilter/libmpcodecs/vf_softpulldown.c b/libavfilter/libmpcodecs/vf_softpulldown.c
new file mode 100644
index 0000000000..556374eb06
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_softpulldown.c
@@ -0,0 +1,163 @@
+/*
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config.h"
+#include "mp_msg.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+
+#include "libvo/fastmemcpy.h"
+
+struct vf_priv_s {
+    int state;     // 0 or 1; put_image() sets it to 1 once dmpi holds the even rows of a frame awaiting the odd rows of the next one
+    long long in;  // frames received by put_image()
+    long long out; // frames passed on to the next filter
+};
+
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts) // turns repeat-first-field flags into real frames by weaving fields of consecutive inputs in dmpi; pts is not used
+{
+    mp_image_t *dmpi;
+    int ret = 0;
+    int flags = mpi->fields;
+    int state = vf->priv->state;
+
+    dmpi = ff_vf_get_image(vf->next, mpi->imgfmt, // static + preserve: the buffer content is relied on across calls
+                        MP_IMGTYPE_STATIC, MP_IMGFLAG_ACCEPT_STRIDE |
+                        MP_IMGFLAG_PRESERVE, mpi->width, mpi->height);
+
+    vf->priv->in++;
+
+    if ((state == 0 && // state 0 expects top-field-first input, state 1 bottom-field-first
+         !(flags & MP_IMGFIELD_TOP_FIRST)) ||
+        (state == 1 &&
+         flags & MP_IMGFIELD_TOP_FIRST)) {
+        ff_mp_msg(MSGT_VFILTER, MSGL_WARN,
+               "softpulldown: Unexpected field flags: state=%d top_field_first=%d repeat_first_field=%d\n",
+               state,
+               (flags & MP_IMGFIELD_TOP_FIRST) != 0,
+               (flags & MP_IMGFIELD_REPEAT_FIRST) != 0);
+        state ^= 1; // resynchronise with the stream
+    }
+
+    if (state == 0) {
+        ret = ff_vf_next_put_image(vf, mpi, MP_NOPTS_VALUE); // the input frame goes out unchanged
+        vf->priv->out++;
+        if (flags & MP_IMGFIELD_REPEAT_FIRST) { // keep the even rows (doubled stride, half height) in dmpi for the next call
+            my_memcpy_pic(dmpi->planes[0],
+                       mpi->planes[0], mpi->w, mpi->h/2,
+                       dmpi->stride[0]*2, mpi->stride[0]*2);
+            if (mpi->flags & MP_IMGFLAG_PLANAR) {
+                my_memcpy_pic(dmpi->planes[1],
+                              mpi->planes[1],
+                              mpi->chroma_width,
+                              mpi->chroma_height/2,
+                              dmpi->stride[1]*2,
+                              mpi->stride[1]*2);
+                my_memcpy_pic(dmpi->planes[2],
+                              mpi->planes[2],
+                              mpi->chroma_width,
+                              mpi->chroma_height/2,
+                              dmpi->stride[2]*2,
+                              mpi->stride[2]*2);
+            }
+            state=1;
+        }
+    } else {
+        my_memcpy_pic(dmpi->planes[0]+dmpi->stride[0], // fill the odd rows of dmpi from this frame (start one row down, doubled stride)
+                      mpi->planes[0]+mpi->stride[0], mpi->w, mpi->h/2,
+                      dmpi->stride[0]*2, mpi->stride[0]*2);
+        if (mpi->flags & MP_IMGFLAG_PLANAR) {
+            my_memcpy_pic(dmpi->planes[1]+dmpi->stride[1],
+                          mpi->planes[1]+mpi->stride[1],
+                          mpi->chroma_width, mpi->chroma_height/2,
+                          dmpi->stride[1]*2, mpi->stride[1]*2);
+            my_memcpy_pic(dmpi->planes[2]+dmpi->stride[2],
+                          mpi->planes[2]+mpi->stride[2],
+                          mpi->chroma_width, mpi->chroma_height/2,
+                          dmpi->stride[2]*2, mpi->stride[2]*2);
+        }
+        ret = ff_vf_next_put_image(vf, dmpi, MP_NOPTS_VALUE); // woven frame: stored even rows + current odd rows
+        vf->priv->out++;
+        if (flags & MP_IMGFIELD_REPEAT_FIRST) { // repeated field: the input frame is emitted as well, nothing stays buffered
+            ret |= ff_vf_next_put_image(vf, mpi, MP_NOPTS_VALUE);
+            vf->priv->out++;
+            state=0;
+        } else { // stay in state 1: buffer this frame's even rows for the next call
+            my_memcpy_pic(dmpi->planes[0],
+                          mpi->planes[0], mpi->w, mpi->h/2,
+                          dmpi->stride[0]*2, mpi->stride[0]*2);
+            if (mpi->flags & MP_IMGFLAG_PLANAR) {
+                my_memcpy_pic(dmpi->planes[1],
+                              mpi->planes[1],
+                              mpi->chroma_width,
+                              mpi->chroma_height/2,
+                              dmpi->stride[1]*2,
+                              mpi->stride[1]*2);
+                my_memcpy_pic(dmpi->planes[2],
+                              mpi->planes[2],
+                              mpi->chroma_width,
+                              mpi->chroma_height/2,
+                              dmpi->stride[2]*2,
+                              mpi->stride[2]*2);
+            }
+        }
+    }
+
+    vf->priv->state = state;
+
+    return ret; // result of the last output call, OR-ed with the extra one when two frames were emitted
+}
+
+static int config(struct vf_instance *vf, // nothing to set up here: the configuration is forwarded unchanged
+    int width, int height, int d_width, int d_height,
+    unsigned int flags, unsigned int outfmt)
+{
+    return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
+}
+
+static void uninit(struct vf_instance *vf) // log the frame counters and free the private context; vf->priv is dereferenced without a NULL check
+{
+    ff_mp_msg(MSGT_VFILTER, MSGL_INFO, "softpulldown: %lld frames in, %lld frames out\n", vf->priv->in, vf->priv->out);
+    free(vf->priv);
+}
+
+// Filter entry point; args is ignored. Returns 1 on success, 0 when the private context cannot be allocated.
+static int vf_open(vf_instance_t *vf, char *args)
+{
+    vf->config = config;
+    vf->put_image = put_image;
+    vf->uninit = uninit;
+    vf->default_reqs = VFCAP_ACCEPT_STRIDE;
+    vf->priv = calloc(1, sizeof(struct vf_priv_s)); // zero-filled: state and both frame counters start at 0
+    return vf->priv != NULL; // never touch priv when calloc() failed
+}
+
+const vf_info_t ff_vf_info_softpulldown = { // filter descriptor; NOTE(review): vf_info_t is declared elsewhere, field meanings below are inferred from the values
+    "mpeg2 soft 3:2 pulldown", // presumably the human-readable description
+    "softpulldown", // presumably the name used to select the filter
+    "Tobias Diedrich <ranma+mplayer@tdiedrich.de>", // author
+    "", // presumably a free-form comment
+    vf_open, // open callback defined above
+    NULL // NOTE(review): last field left unset -- confirm its meaning against vf.h
+};
diff --git a/libavfilter/libmpcodecs/vf_uspp.c b/libavfilter/libmpcodecs/vf_uspp.c
new file mode 100644
index 0000000000..c9d9c1fd16
--- /dev/null
+++ b/libavfilter/libmpcodecs/vf_uspp.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "mp_msg.h"
+#include "cpudetect.h"
+
+#include "libavutil/mem.h"
+#include "libavcodec/avcodec.h"
+
+#include "img_format.h"
+#include "mp_image.h"
+#include "vf.h"
+#include "av_helpers.h"
+#include "libvo/fastmemcpy.h"
+
+#define XMIN(a,b) ((a) < (b) ? (a) : (b))
+
+#define BLOCK 16
+
+//===========================================================================//
+DECLARE_ALIGNED(8, static const uint8_t, dither)[8][8] = { // 8x8 dither offsets: each of 0..63 appears once, scaled by 4; store_slice_c adds them before its final >>8
+{  0*4,  48*4,  12*4,  60*4,   3*4,  51*4,  15*4,  63*4, },
+{ 32*4,  16*4,  44*4,  28*4,  35*4,  19*4,  47*4,  31*4, },
+{  8*4,  56*4,   4*4,  52*4,  11*4,  59*4,   7*4,  55*4, },
+{ 40*4,  24*4,  36*4,  20*4,  43*4,  27*4,  39*4,  23*4, },
+{  2*4,  50*4,  14*4,  62*4,   1*4,  49*4,  13*4,  61*4, },
+{ 34*4,  18*4,  46*4,  30*4,  33*4,  17*4,  45*4,  29*4, },
+{ 10*4,  58*4,   6*4,  54*4,   9*4,  57*4,   5*4,  53*4, },
+{ 42*4,  26*4,  38*4,  22*4,  41*4,  25*4,  37*4,  21*4, },
+};
+
+static const uint8_t offset[511][2]= {
+{ 0, 0},
+{ 0, 0}, { 8, 8},
+{ 0, 0}, { 4, 4}, {12, 8}, { 8,12},
+{ 0, 0}, {10, 2}, { 4, 4}, {14, 6}, { 8, 8}, { 2,10}, {12,12}, { 6,14},
+
+{ 0, 0}, {10, 2}, { 4, 4}, {14, 6}, { 8, 8}, { 2,10}, {12,12}, { 6,14},
+{ 5, 1}, {15, 3}, { 9, 5}, { 3, 7}, {13, 9}, { 7,11}, { 1,13}, {11,15},
+
+{ 0, 0}, { 8, 0}, { 0, 8}, { 8, 8}, { 5, 1}, {13, 1}, { 5, 9}, {13, 9},
+{ 2, 2}, {10, 2}, { 2,10}, {10,10}, { 7, 3}, {15, 3}, { 7,11}, {15,11},
+{ 4, 4}, {12, 4}, { 4,12}, {12,12}, { 1, 5}, { 9, 5}, { 1,13}, { 9,13},
+{ 6, 6}, {14, 6}, { 6,14}, {14,14}, { 3, 7}, {11, 7}, { 3,15}, {11,15},
+
+{ 0, 0}, { 8, 0}, { 0, 8}, { 8, 8}, { 4, 0}, {12, 0}, { 4, 8}, {12, 8},
+{ 1, 1}, { 9, 1}, { 1, 9}, { 9, 9}, { 5, 1}, {13, 1}, { 5, 9}, {13, 9},
+{ 3, 2}, {11, 2}, { 3,10}, {11,10}, { 7, 2}, {15, 2}, { 7,10}, {15,10},
+{ 2, 3}, {10, 3}, { 2,11}, {10,11}, { 6, 3}, {14, 3}, { 6,11}, {14,11},
+{ 0, 4}, { 8, 4}, { 0,12}, { 8,12}, { 4, 4}, {12, 4}, { 4,12}, {12,12},
+{ 1, 5}, { 9, 5}, { 1,13}, { 9,13}, { 5, 5}, {13, 5}, { 5,13}, {13,13},
+{ 3, 6}, {11, 6}, { 3,14}, {11,14}, { 7, 6}, {15, 6}, { 7,14}, {15,14},
+{ 2, 7}, {10, 7}, { 2,15}, {10,15}, { 6, 7}, {14, 7}, { 6,15}, {14,15},
+
+{ 0, 0}, { 8, 0}, { 0, 8}, { 8, 8}, { 0, 2}, { 8, 2}, { 0,10}, { 8,10},
+{ 0, 4}, { 8, 4}, { 0,12}, { 8,12}, { 0, 6}, { 8, 6}, { 0,14}, { 8,14},
+{ 1, 1}, { 9, 1}, { 1, 9}, { 9, 9}, { 1, 3}, { 9, 3}, { 1,11}, { 9,11},
+{ 1, 5}, { 9, 5}, { 1,13}, { 9,13}, { 1, 7}, { 9, 7}, { 1,15}, { 9,15},
+{ 2, 0}, {10, 0}, { 2, 8}, {10, 8}, { 2, 2}, {10, 2}, { 2,10}, {10,10},
+{ 2, 4}, {10, 4}, { 2,12}, {10,12}, { 2, 6}, {10, 6}, { 2,14}, {10,14},
+{ 3, 1}, {11, 1}, { 3, 9}, {11, 9}, { 3, 3}, {11, 3}, { 3,11}, {11,11},
+{ 3, 5}, {11, 5}, { 3,13}, {11,13}, { 3, 7}, {11, 7}, { 3,15}, {11,15},
+{ 4, 0}, {12, 0}, { 4, 8}, {12, 8}, { 4, 2}, {12, 2}, { 4,10}, {12,10},
+{ 4, 4}, {12, 4}, { 4,12}, {12,12}, { 4, 6}, {12, 6}, { 4,14}, {12,14},
+{ 5, 1}, {13, 1}, { 5, 9}, {13, 9}, { 5, 3}, {13, 3}, { 5,11}, {13,11},
+{ 5, 5}, {13, 5}, { 5,13}, {13,13}, { 5, 7}, {13, 7}, { 5,15}, {13,15},
+{ 6, 0}, {14, 0}, { 6, 8}, {14, 8}, { 6, 2}, {14, 2}, { 6,10}, {14,10},
+{ 6, 4}, {14, 4}, { 6,12}, {14,12}, { 6, 6}, {14, 6}, { 6,14}, {14,14},
+{ 7, 1}, {15, 1}, { 7, 9}, {15, 9}, { 7, 3}, {15, 3}, { 7,11}, {15,11},
+{ 7, 5}, {15, 5}, { 7,13}, {15,13}, { 7, 7}, {15, 7}, { 7,15}, {15,15},
+
+{ 0, 0}, { 8, 0}, { 0, 8}, { 8, 8}, { 4, 4}, {12, 4}, { 4,12}, {12,12}, { 0, 4}, { 8, 4}, { 0,12}, { 8,12}, { 4, 0}, {12, 0}, { 4, 8}, {12, 8}, { 2, 2}, {10, 2}, { 2,10}, {10,10}, { 6, 6}, {14, 6}, { 6,14}, {14,14}, { 2, 6}, {10, 6}, { 2,14}, {10,14}, { 6, 2}, {14, 2}, { 6,10}, {14,10}, { 0, 2}, { 8, 2}, { 0,10}, { 8,10}, { 4, 6}, {12, 6}, { 4,14}, {12,14}, { 0, 6}, { 8, 6}, { 0,14}, { 8,14}, { 4, 2}, {12, 2}, { 4,10}, {12,10}, { 2, 0}, {10, 0}, { 2, 8}, {10, 8}, { 6, 4}, {14, 4}, { 6,12}, {14,12}, { 2, 4}, {10, 4}, { 2,12}, {10,12}, { 6, 0}, {14, 0}, { 6, 8}, {14, 8}, { 1, 1}, { 9, 1}, { 1, 9}, { 9, 9}, { 5, 5}, {13, 5}, { 5,13}, {13,13}, { 1, 5}, { 9, 5}, { 1,13}, { 9,13}, { 5, 1}, {13, 1}, { 5, 9}, {13, 9}, { 3, 3}, {11, 3}, { 3,11}, {11,11}, { 7, 7}, {15, 7}, { 7,15}, {15,15}, { 3, 7}, {11, 7}, { 3,15}, {11,15}, { 7, 3}, {15, 3}, { 7,11}, {15,11}, { 1, 3}, { 9, 3}, { 1,11}, { 9,11}, { 5, 7}, {13, 7}, { 5,15}, {13,15}, { 1, 7}, { 9, 7}, { 1,15}, { 9,15}, { 5, 3}, {13, 3}, { 5,11}, {13,11}, { 3, 1}, {11, 1}
+, { 3, 9}, {11, 9}, { 7, 5}, {15, 5}, { 7,13}, {15,13}, { 3, 5}, {11, 5}, { 3,13}, {11,13}, { 7, 1}, {15, 1}, { 7, 9}, {15, 9}, { 0, 1}, { 8, 1}, { 0, 9}, { 8, 9}, { 4, 5}, {12, 5}, { 4,13}, {12,13}, { 0, 5}, { 8, 5}, { 0,13}, { 8,13}, { 4, 1}, {12, 1}, { 4, 9}, {12, 9}, { 2, 3}, {10, 3}, { 2,11}, {10,11}, { 6, 7}, {14, 7}, { 6,15}, {14,15}, { 2, 7}, {10, 7}, { 2,15}, {10,15}, { 6, 3}, {14, 3}, { 6,11}, {14,11}, { 0, 3}, { 8, 3}, { 0,11}, { 8,11}, { 4, 7}, {12, 7}, { 4,15}, {12,15}, { 0, 7}, { 8, 7}, { 0,15}, { 8,15}, { 4, 3}, {12, 3}, { 4,11}, {12,11}, { 2, 1}, {10, 1}, { 2, 9}, {10, 9}, { 6, 5}, {14, 5}, { 6,13}, {14,13}, { 2, 5}, {10, 5}, { 2,13}, {10,13}, { 6, 1}, {14, 1}, { 6, 9}, {14, 9}, { 1, 0}, { 9, 0}, { 1, 8}, { 9, 8}, { 5, 4}, {13, 4}, { 5,12}, {13,12}, { 1, 4}, { 9, 4}, { 1,12}, { 9,12}, { 5, 0}, {13, 0}, { 5, 8}, {13, 8}, { 3, 2}, {11, 2}, { 3,10}, {11,10}, { 7, 6}, {15, 6}, { 7,14}, {15,14}, { 3, 6}, {11, 6}, { 3,14}, {11,14}, { 7, 2}, {15, 2}, { 7,10}, {15,10}, { 1, 2}, { 9, 2}, { 1,10}, { 9,
+10}, { 5, 6}, {13, 6}, { 5,14}, {13,14}, { 1, 6}, { 9, 6}, { 1,14}, { 9,14}, { 5, 2}, {13, 2}, { 5,10}, {13,10}, { 3, 0}, {11, 0}, { 3, 8}, {11, 8}, { 7, 4}, {15, 4}, { 7,12}, {15,12}, { 3, 4}, {11, 4}, { 3,12}, {11,12}, { 7, 0}, {15, 0}, { 7, 8}, {15, 8},
+};
+
+struct vf_priv_s {
+    int log2_count; // filter() runs 1<<log2_count shifted encode passes
+    int qp; // forced quantizer; 0 = take it from the frame's qscale table
+    int mode; // parsed from the filter arguments; not read anywhere else in this file
+    int mpeg2; // qscale_type of the current frame, handed to norm_qscale()
+    int temp_stride[3]; // per-plane row stride (in elements) shared by src[] and temp[]
+    uint8_t *src[3]; // mirror-padded copy of each input plane
+    int16_t *temp[3]; // per-plane accumulators summed over all passes
+    int outbuf_size; // size of outbuf in bytes
+    uint8_t *outbuf; // output buffer handed to avcodec_encode_video()
+    AVCodecContext *avctx_enc[BLOCK*BLOCK]; // one encoder context per pass; config() opens the first 1<<log2_count
+    AVFrame *frame; // input frame description handed to the encoder
+    AVFrame *frame_dec; // NOTE(review): slot for the encoder's output picture -- check filter()/config() for who owns it
+};
+
+static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale){
+        // Scale the accumulated sums back to 8 bits, add the dither offsets and
+        // clip. Pixels are handled in whole groups of 8, so up to 7 pixels past
+        // width are read and written when width is not a multiple of 8.
+        int row, col, k;
+
+        for(row=0; row<height; row++){
+                const uint8_t *noise= dither[row&7];
+                const int16_t *in= src + row*src_stride;
+                uint8_t *out= dst + row*dst_stride;
+
+                for(col=0; col<width; col+=8){
+                        for(k=0; k<8; k++){
+                                int v= ((in[col+k]<<log2_scale) + noise[k])>>8;
+                                // bit 8 set: v is treated as out of range; ~(v>>31) is -1
+                                // (stored as 255) for positive v and 0 for negative v
+                                if(v & 0x100)
+                                        v= ~(v>>31);
+                                out[col+k]= v;
+                        }
+                }
+        }
+}
+
+// Postprocess all three planes: encode the mirror-padded picture once per shifted pass,
+// sum the pictures the encoder leaves in coded_frame into p->temp[] and scale the sums back into dst[].
+static void filter(struct vf_priv_s *p, uint8_t *dst[3], uint8_t *src[3], int dst_stride[3], int src_stride[3], int width, int height, uint8_t *qp_store, int qp_stride){
+    int x, y, i, j;
+    const int count= 1<<p->log2_count; // number of shifted passes
+
+    for(i=0; i<3; i++){ // build the padded copy of every plane and clear its accumulator
+        int is_chroma= !!i;
+        int w= width >>is_chroma;
+        int h= height>>is_chroma;
+        int stride= p->temp_stride[i];
+        int block= BLOCK>>is_chroma;
+        if (!src[i] || !dst[i])
+            continue; // HACK avoid crash for Y8 colourspace
+        for(y=0; y<h; y++){ // copy each row and mirror `block` pixels beyond its left and right edges
+            int index= block + block*stride + y*stride;
+            fast_memcpy(p->src[i] + index, src[i] + y*src_stride[i], w);
+            for(x=0; x<block; x++){
+                p->src[i][index     - x - 1]= p->src[i][index +     x    ];
+                p->src[i][index + w + x    ]= p->src[i][index + w - x - 1];
+            }
+        }
+        for(y=0; y<block; y++){ // mirror `block` full rows above and below the picture
+            fast_memcpy(p->src[i] + (  block-1-y)*stride, p->src[i] + (  y+block  )*stride, stride);
+            fast_memcpy(p->src[i] + (h+block  +y)*stride, p->src[i] + (h-y+block-1)*stride, stride);
+        }
+
+        p->frame->linesize[i]= stride;
+        memset(p->temp[i], 0, (h+2*block)*stride*sizeof(int16_t));
+    }
+
+    if(p->qp)
+        p->frame->quality= p->qp * FF_QP2LAMBDA;
+    else
+        p->frame->quality= norm_qscale(qp_store[0], p->mpeg2) * FF_QP2LAMBDA;
+//    init per MB qscale stuff FIXME
+
+    for(i=0; i<count; i++){
+        const int x1= offset[i+count-1][0]; // shift of this pass, taken from the global offset[] table
+        const int y1= offset[i+count-1][1];
+        const AVFrame *dec; // encoder-owned picture; a local, so p->frame_dec is never overwritten with a pointer this filter must not free
+        int dec_off; // named so that it no longer shadows the offset[] table
+        p->frame->data[0]= p->src[0] + x1 + y1 * p->frame->linesize[0];
+        p->frame->data[1]= p->src[1] + x1/2 + y1/2 * p->frame->linesize[1];
+        p->frame->data[2]= p->src[2] + x1/2 + y1/2 * p->frame->linesize[2];
+        avcodec_encode_video(p->avctx_enc[i], p->outbuf, p->outbuf_size, p->frame);
+        dec = p->avctx_enc[i]->coded_frame;
+        dec_off= (BLOCK-x1) + (BLOCK-y1)*dec->linesize[0]; // undo the shift and skip the padding
+        //FIXME optimize
+        for(y=0; y<height; y++){
+            for(x=0; x<width; x++){
+                p->temp[0][ x + y*p->temp_stride[0] ] += dec->data[0][ x + y*dec->linesize[0] + dec_off ];
+            }
+        }
+        dec_off= (BLOCK/2-x1/2) + (BLOCK/2-y1/2)*dec->linesize[1];
+        for(y=0; y<height/2; y++){
+            for(x=0; x<width/2; x++){
+                p->temp[1][ x + y*p->temp_stride[1] ] += dec->data[1][ x + y*dec->linesize[1] + dec_off ];
+                p->temp[2][ x + y*p->temp_stride[2] ] += dec->data[2][ x + y*dec->linesize[2] + dec_off ];
+            }
+        }
+    }
+
+    for(j=0; j<3; j++){
+        int is_chroma= !!j;
+        if (!dst[j])
+            continue; // HACK avoid crash for Y8 colourspace
+        store_slice_c(dst[j], p->temp[j], dst_stride[j], p->temp_stride[j], width>>is_chroma, height>>is_chroma, 8-p->log2_count);
+    }
+}
+
+// Allocate the padded work buffers and open one Snow encoder context per pass.
+// Returns 0 as soon as an allocation or avcodec_open2() fails.
+static int config(struct vf_instance *vf,
+        int width, int height, int d_width, int d_height,
+        unsigned int flags, unsigned int outfmt){
+        int i;
+        AVCodec *enc= avcodec_find_encoder(AV_CODEC_ID_SNOW);
+
+        for(i=0; i<3; i++){
+            int is_chroma= !!i;
+            int w= ((width  + 4*BLOCK-1) & (~(2*BLOCK-1)))>>is_chroma;
+            int h= ((height + 4*BLOCK-1) & (~(2*BLOCK-1)))>>is_chroma;
+            vf->priv->temp_stride[i]= w;
+            vf->priv->temp[i]= malloc(vf->priv->temp_stride[i]*h*sizeof(int16_t));
+            vf->priv->src [i]= malloc(vf->priv->temp_stride[i]*h*sizeof(uint8_t));
+            if(!vf->priv->temp[i] || !vf->priv->src[i]) return 0; // pointers already stored stay in vf->priv
+        }
+        for(i=0; i< (1<<vf->priv->log2_count); i++){
+            AVDictionary *opts = NULL;
+            AVCodecContext *avctx_enc= avcodec_alloc_context3(NULL);
+            int ret;
+
+            vf->priv->avctx_enc[i]= avctx_enc;
+            if(!avctx_enc) return 0;
+            avctx_enc->width = width + BLOCK;
+            avctx_enc->height = height + BLOCK;
+            avctx_enc->time_base= (AVRational){1,25};  // meaningless
+            avctx_enc->gop_size = 300;
+            avctx_enc->max_b_frames= 0;
+            avctx_enc->pix_fmt = AV_PIX_FMT_YUV420P;
+            avctx_enc->flags = CODEC_FLAG_QSCALE | CODEC_FLAG_LOW_DELAY;
+            avctx_enc->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
+            avctx_enc->global_quality= 123;
+            av_dict_set(&opts, "no_bitstream", "1", 0);
+            ret= avcodec_open2(avctx_enc, enc, &opts); // fails (and config() with it) when no Snow encoder was found
+            av_dict_free(&opts); // release the dictionary on the failure path too
+            if(ret < 0) return 0;
+        }
+        vf->priv->frame= av_frame_alloc(); // no frame is allocated for frame_dec: filter() reads the encoder's own coded_frame and never needs an owned one
+        vf->priv->outbuf_size= (width + BLOCK)*(height + BLOCK)*10;
+        vf->priv->outbuf= malloc(vf->priv->outbuf_size);
+        if(!vf->priv->frame || !vf->priv->outbuf) return 0;
+        return ff_vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
+}
+
+static void get_image(struct vf_instance *vf, mp_image_t *mpi){ // direct rendering: point mpi at the buffer obtained from the next filter
+    mp_image_t *target;
+    int plane, last_plane;
+
+    if(mpi->flags&MP_IMGFLAG_PRESERVE) // don't change
+        return;
+    target= vf->dmpi= ff_vf_get_image(vf->next, mpi->imgfmt, mpi->type,
+                                      mpi->flags | MP_IMGFLAG_READABLE, mpi->width, mpi->height);
+    last_plane= (mpi->flags&MP_IMGFLAG_PLANAR) ? 2 : 0; // non-planar images: only plane 0 is redirected
+    for(plane=0; plane<=last_plane; plane++){
+        mpi->planes[plane]= target->planes[plane];
+        mpi->stride[plane]= target->stride[plane];
+    }
+    mpi->width= target->width;
+    mpi->flags|= MP_IMGFLAG_DIRECT; // put_image() checks this flag and then uses vf->dmpi
+}
+
+static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts){ // filter (or plainly copy) one frame into dmpi and hand it to the next filter
+    mp_image_t *dmpi= vf->dmpi; // used as is when the frame arrived with MP_IMGFLAG_DIRECT
+
+    if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
+        // no DR, so get a new image! hope we'll get DR buffer:
+        dmpi=ff_vf_get_image(vf->next,mpi->imgfmt,
+            MP_IMGTYPE_TEMP,
+            MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
+            mpi->width,mpi->height);
+        ff_vf_clone_mpi_attributes(dmpi, mpi);
+    }
+
+    vf->priv->mpeg2= mpi->qscale_type;
+    // log2_count 0 together with a direct-rendered frame: the picture is passed on untouched
+    if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
+        if(!mpi->qscale && !vf->priv->qp){ // neither a qscale table nor a forced qp: plain copy
+            const int cw= mpi->w>>mpi->chroma_x_shift, ch= mpi->h>>mpi->chroma_y_shift;
+            memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
+            memcpy_pic(dmpi->planes[1], mpi->planes[1], cw, ch, dmpi->stride[1], mpi->stride[1]);
+            memcpy_pic(dmpi->planes[2], mpi->planes[2], cw, ch, dmpi->stride[2], mpi->stride[2]);
+        }else{
+            filter(vf->priv, dmpi->planes, mpi->planes, dmpi->stride, mpi->stride, mpi->w, mpi->h, mpi->qscale, mpi->qstride);
+        }
+    }
+
+#if HAVE_MMX
+    if(ff_gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
+#endif
+#if HAVE_MMX2
+    if(ff_gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
+#endif
+
+    return ff_vf_next_put_image(vf,dmpi, pts);
+}
+
+static void uninit(struct vf_instance *vf){ // release the work buffers, encoder contexts, input frame, output buffer and the private context
+    int i;
+    if(!vf->priv) return;
+    for(i=0; i<3; i++){
+        free(vf->priv->temp[i]);
+        free(vf->priv->src[i]);
+    }
+    for(i=0; i<BLOCK*BLOCK; i++){
+        if(vf->priv->avctx_enc[i])
+            avcodec_close(vf->priv->avctx_enc[i]); // releases the encoder's internal state; the context struct itself is freed below
+        av_freep(&vf->priv->avctx_enc[i]);
+    }
+    av_frame_free(&vf->priv->frame); // frame_dec is deliberately left alone: it is not guaranteed to be a frame this filter owns
+    free(vf->priv->outbuf);
+    free(vf->priv);
+    vf->priv=NULL;
+}
+
+//===========================================================================//
+// Only the formats tested below are handled by this filter; for those the
+// decision is delegated to the next filter, everything else is rejected.
+static int query_format(struct vf_instance *vf, unsigned int fmt){
+    const int supported= fmt == IMGFMT_YV12 || fmt == IMGFMT_I420 ||
+                         fmt == IMGFMT_IYUV || fmt == IMGFMT_Y800 ||
+                         fmt == IMGFMT_Y8;
+
+    if(!supported)
+        return 0;
+    return ff_vf_next_query_format(vf,fmt);
+}
+
+static int control(struct vf_instance *vf, int request, void* data){ // answers the two postprocessing-level requests, forwards everything else
+    switch(request){
+    case VFCTRL_QUERY_MAX_PP_LEVEL:
+        return 8;
+    case VFCTRL_SET_PP_LEVEL:
+        vf->priv->log2_count= XMIN(*((unsigned int*)data), 8u); // clamp: offset[] and avctx_enc[] cover at most 1<<8 passes
+        //FIXME we have to realloc a few things here (encoder contexts exist only for the count known at config() time)
+        return CONTROL_TRUE;
+    }
+    return ff_vf_next_control(vf,request,data);
+}
+
+// Filter entry point. args is an optional "log2_count:qp:mode" string:
+// log2_count (0..8, default 4) sets the number of shifted passes to 1<<log2_count,
+// qp > 0 forces a fixed quantizer instead of the frame's qscale table,
+// mode is parsed but not used anywhere in this file.
+// Returns 1 on success, 0 if the private context cannot be allocated.
+static int vf_open(vf_instance_t *vf, char *args){
+    int log2c=-1;
+
+    vf->config=config;
+    vf->put_image=put_image;
+    vf->get_image=get_image;
+    vf->query_format=query_format;
+    vf->uninit=uninit;
+    vf->control= control;
+    vf->priv=malloc(sizeof(struct vf_priv_s));
+    if(!vf->priv) return 0;
+    memset(vf->priv, 0, sizeof(struct vf_priv_s));
+
+    ff_init_avcodec();
+
+    vf->priv->log2_count= 4;
+
+    if (args) sscanf(args, "%d:%d:%d", &log2c, &vf->priv->qp, &vf->priv->mode);
+
+    // out-of-range (or missing) values keep the default of 4
+    if( log2c >=0 && log2c <=8 )
+        vf->priv->log2_count = log2c;
+
+    if(vf->priv->qp < 0)
+        vf->priv->qp = 0;
+
+    return 1;
+}
+
+const vf_info_t ff_vf_info_uspp = { // filter descriptor; NOTE(review): vf_info_t is declared elsewhere, field meanings below are inferred from the values
+    "ultra simple/slow postprocess", // presumably the human-readable description
+    "uspp", // presumably the name used to select the filter
+    "Michael Niedermayer", // author
+    "", // presumably a free-form comment
+    vf_open, // open callback defined above
+    NULL // NOTE(review): last field left unset -- confirm its meaning against vf.h
+};
diff --git a/libavfilter/libmpcodecs/vfcap.h b/libavfilter/libmpcodecs/vfcap.h
new file mode 100644
index 0000000000..611d642869
--- /dev/null
+++ b/libavfilter/libmpcodecs/vfcap.h
@@ -0,0 +1,56 @@
+/* VFCAP_* values: single-bit capability flags, returned by query_format().
+ *
+ * This file is part of MPlayer.
+ *
+ * MPlayer is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * MPlayer is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with MPlayer; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef MPLAYER_VFCAP_H
+#define MPLAYER_VFCAP_H
+
+// set if the given colorspace is supported (with or without conversion)
+#define VFCAP_CSP_SUPPORTED 0x1
+// set if the given colorspace is supported _without_ conversion
+#define VFCAP_CSP_SUPPORTED_BY_HW 0x2
+// set if the driver/filter can draw OSD
+#define VFCAP_OSD 0x4
+// set if the driver/filter can handle a compressed SPU stream
+#define VFCAP_SPU 0x8
+// scaling up / scaling down by hardware, or scaling by software:
+#define VFCAP_HWSCALE_UP 0x10
+#define VFCAP_HWSCALE_DOWN 0x20
+#define VFCAP_SWSCALE 0x40
+// driver/filter can flip the image vertically (upside-down)
+#define VFCAP_FLIP 0x80
+
+// driver/hardware handles timing (blocking)
+#define VFCAP_TIMER 0x100
+// driver _always_ flips the image upside-down (for ve_vfw)
+#define VFCAP_FLIPPED 0x200
+// vf filter: accepts stride (put_image)
+// vo driver: has draw_slice() support for the given csp
+#define VFCAP_ACCEPT_STRIDE 0x400
+// filter does postprocessing (so you shouldn't scale/filter the image before it)
+#define VFCAP_POSTPROC 0x800
+// filter cannot be reconfigured to a different size & format
+#define VFCAP_CONSTANT 0x1000
+// filter can draw EOSD
+#define VFCAP_EOSD 0x2000
+// filter will draw EOSD at screen resolution (without scaling)
+#define VFCAP_EOSD_UNSCALED 0x4000
+// used by libvo and vf_vo: the VO does not support draw_slice for this format (note the VOCAP_ prefix)
+#define VOCAP_NOSLICES 0x8000
+
+#endif /* MPLAYER_VFCAP_H */