From e3fcb14347466095839c2a3c47ebecff02da891e Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Fri, 24 Jan 2014 11:55:16 +0100
Subject: dsputil: Split off IDCT bits into their own context

---
 configure                              |  40 +++---
 doc/optimization.txt                   |   3 -
 libavcodec/Makefile                    |   5 +-
 libavcodec/aic.c                       |  23 ++--
 libavcodec/arm/Makefile                |  23 ++--
 libavcodec/arm/dsputil_arm.S           | 120 ----------------
 libavcodec/arm/dsputil_arm.h           |   4 -
 libavcodec/arm/dsputil_armv6.S         |  27 ----
 libavcodec/arm/dsputil_init_arm.c      |  60 --------
 libavcodec/arm/dsputil_init_armv5te.c  |  43 ------
 libavcodec/arm/dsputil_init_armv6.c    |  18 ---
 libavcodec/arm/dsputil_init_neon.c     |  53 -------
 libavcodec/arm/dsputil_neon.S          | 128 -----------------
 libavcodec/arm/idctdsp_arm.S           | 120 ++++++++++++++++
 libavcodec/arm/idctdsp_arm.h           |  34 +++++
 libavcodec/arm/idctdsp_armv6.S         |  48 +++++++
 libavcodec/arm/idctdsp_init_arm.c      |  98 +++++++++++++
 libavcodec/arm/idctdsp_init_armv5te.c  |  43 ++++++
 libavcodec/arm/idctdsp_init_armv6.c    |  48 +++++++
 libavcodec/arm/idctdsp_init_neon.c     |  53 +++++++
 libavcodec/arm/idctdsp_neon.S          | 128 +++++++++++++++++
 libavcodec/asv.c                       |   1 -
 libavcodec/asv.h                       |   2 +
 libavcodec/asvdec.c                    |  16 ++-
 libavcodec/asvenc.c                    |   1 +
 libavcodec/cavs.c                      |   7 +-
 libavcodec/cavs.h                      |   4 +-
 libavcodec/cavsdsp.c                   |   2 +-
 libavcodec/dnxhddec.c                  |  52 +++----
 libavcodec/dnxhdenc.c                  |   9 +-
 libavcodec/dsputil.c                   | 162 +---------------------
 libavcodec/dsputil.h                   |  66 ---------
 libavcodec/dvdec.c                     |   9 +-
 libavcodec/dxva2_mpeg2.c               |   2 +-
 libavcodec/eamad.c                     |   9 +-
 libavcodec/eatgq.c                     |   2 +-
 libavcodec/eatqi.c                     |   7 +-
 libavcodec/g2meet.c                    |  12 +-
 libavcodec/h263.c                      |   8 +-
 libavcodec/idctdsp.c                   | 197 ++++++++++++++++++++++++++
 libavcodec/idctdsp.h                   | 104 ++++++++++++++
 libavcodec/intrax8.c                   |  15 +-
 libavcodec/ljpegenc.c                  |   9 +-
 libavcodec/mdec.c                      |  20 +--
 libavcodec/mimic.c                     |  10 +-
 libavcodec/mjpegdec.c                  |  10 +-
 libavcodec/mjpegdec.h                  |   4 +-
 libavcodec/mjpegenc_common.c           |   2 +-
 libavcodec/mjpegenc_common.h           |   2 +-
 libavcodec/mpeg12dec.c                 |  30 ++--
 libavcodec/mpeg4videodec.c             |  39 +++---
 libavcodec/mpeg4videoenc.c             |  28 ++--
 libavcodec/mpegvideo.c                 |  52 +++----
 libavcodec/mpegvideo.h                 |   2 +
 libavcodec/mpegvideo_enc.c             |  20 +--
 libavcodec/mpegvideo_xvmc.c            |   2 +-
 libavcodec/msmpeg4.c                   |  10 +-
 libavcodec/nuv.c                       |   1 +
 libavcodec/ppc/Makefile                |   2 +-
 libavcodec/ppc/dsputil_altivec.h       |   3 -
 libavcodec/ppc/dsputil_ppc.c           |   6 -
 libavcodec/ppc/idct_altivec.c          | 221 -----------------------------
 libavcodec/ppc/idctdsp.c               | 245 +++++++++++++++++++++++++++++++++
 libavcodec/proresdec.c                 |   2 +-
 libavcodec/proresdsp.c                 |   2 +-
 libavcodec/rtjpeg.c                    |   8 +-
 libavcodec/rtjpeg.h                    |   5 +-
 libavcodec/vc1dec.c                    | 116 +++++++++-------
 libavcodec/wmv2.c                      |  23 ++--
 libavcodec/wmv2dsp.c                   |   2 +-
 libavcodec/x86/Makefile                |  10 +-
 libavcodec/x86/cavsdsp.c               |   3 +-
 libavcodec/x86/dsputil_init.c          |  85 ------------
 libavcodec/x86/dsputil_mmx.c           | 135 ------------------
 libavcodec/x86/dsputil_x86.h           |   7 -
 libavcodec/x86/idct_mmx_xvid.c         |   2 +-
 libavcodec/x86/idct_sse2_xvid.c        |   2 +-
 libavcodec/x86/idctdsp.h               |  31 +++++
 libavcodec/x86/idctdsp_init.c          | 106 ++++++++++++++
 libavcodec/x86/idctdsp_mmx.c           | 168 ++++++++++++++++++++++
 libavcodec/x86/mpegvideoenc_template.c |   2 +-
 libavcodec/x86/proresdsp_init.c        |   2 +-
 libavcodec/x86/simple_idct.c           |   2 +-
 83 files changed, 1788 insertions(+), 1449 deletions(-)
 delete mode 100644 libavcodec/arm/dsputil_arm.S
 delete mode 100644 libavcodec/arm/dsputil_init_armv5te.c
 delete mode 100644 libavcodec/arm/dsputil_init_neon.c
 delete mode 100644 libavcodec/arm/dsputil_neon.S
 create mode 100644 libavcodec/arm/idctdsp_arm.S
 create mode 100644 libavcodec/arm/idctdsp_arm.h
 create mode 100644 libavcodec/arm/idctdsp_armv6.S
 create mode 100644 libavcodec/arm/idctdsp_init_arm.c
 create mode 100644 libavcodec/arm/idctdsp_init_armv5te.c
 create mode 100644 libavcodec/arm/idctdsp_init_armv6.c
 create mode 100644 libavcodec/arm/idctdsp_init_neon.c
 create mode 100644 libavcodec/arm/idctdsp_neon.S
 create mode 100644 libavcodec/idctdsp.c
 create mode 100644 libavcodec/idctdsp.h
 delete mode 100644 libavcodec/ppc/idct_altivec.c
 create mode 100644 libavcodec/ppc/idctdsp.c
 create mode 100644 libavcodec/x86/idctdsp.h
 create mode 100644 libavcodec/x86/idctdsp_init.c
 create mode 100644 libavcodec/x86/idctdsp_mmx.c

diff --git a/configure b/configure
index 7ea15aa9f7..be97868031 100755
--- a/configure
+++ b/configure
@@ -1546,6 +1546,7 @@ CONFIG_EXTRA="
     huffman
     huffyuvdsp
     huffyuvencdsp
+    idctdsp
     intrax8
     lgplv3
     lpc
@@ -1703,6 +1704,7 @@ threads_if_any="$THREADS_LIST"
 
 # subsystems
 dct_select="rdft"
+dsputil_select="idctdsp"
 error_resilience_select="dsputil"
 intrax8_select="error_resilience"
 mdct_select="fft"
@@ -1710,7 +1712,7 @@ rdft_select="fft"
 mpeg_er_select="error_resilience"
 mpegaudio_select="mpegaudiodsp"
 mpegaudiodsp_select="dct"
-mpegvideo_select="blockdsp dsputil hpeldsp videodsp"
+mpegvideo_select="blockdsp dsputil hpeldsp idctdsp videodsp"
 mpegvideoenc_select="dsputil mpegvideo qpeldsp"
 
 # decoders / encoders
@@ -1720,16 +1722,16 @@ aac_latm_decoder_select="aac_decoder aac_latm_parser"
 ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct"
 ac3_encoder_select="ac3dsp audiodsp dsputil mdct"
 ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct"
-aic_decoder_select="dsputil golomb"
+aic_decoder_select="golomb idctdsp"
 alac_encoder_select="lpc"
 als_decoder_select="bswapdsp"
 amrnb_decoder_select="lsp"
 amrwb_decoder_select="lsp"
 amv_decoder_select="sp5x_decoder"
 ape_decoder_select="bswapdsp"
-asv1_decoder_select="blockdsp bswapdsp dsputil"
+asv1_decoder_select="blockdsp bswapdsp idctdsp"
 asv1_encoder_select="bswapdsp dsputil"
-asv2_decoder_select="blockdsp bswapdsp dsputil"
+asv2_decoder_select="blockdsp bswapdsp idctdsp"
 asv2_encoder_select="bswapdsp dsputil"
 atrac1_decoder_select="mdct sinewin"
 atrac3_decoder_select="mdct"
@@ -1737,23 +1739,23 @@ atrac3p_decoder_select="mdct sinewin"
 bink_decoder_select="blockdsp hpeldsp"
 binkaudio_dct_decoder_select="mdct rdft dct sinewin"
 binkaudio_rdft_decoder_select="mdct rdft sinewin"
-cavs_decoder_select="blockdsp dsputil golomb h264chroma qpeldsp videodsp"
+cavs_decoder_select="blockdsp golomb h264chroma idctdsp qpeldsp videodsp"
 cllc_decoder_select="bswapdsp"
 comfortnoise_encoder_select="lpc"
 cook_decoder_select="audiodsp mdct sinewin"
 cscd_decoder_select="lzo"
 cscd_decoder_suggest="zlib"
 dca_decoder_select="mdct"
-dnxhd_decoder_select="blockdsp dsputil"
-dnxhd_encoder_select="aandcttables blockdsp dsputil mpegvideoenc"
-dvvideo_decoder_select="dsputil"
+dnxhd_decoder_select="blockdsp idctdsp"
+dnxhd_encoder_select="aandcttables blockdsp dsputil idctdsp mpegvideoenc"
+dvvideo_decoder_select="idctdsp"
 dvvideo_encoder_select="dsputil"
 dxa_decoder_deps="zlib"
 eac3_decoder_select="ac3_decoder"
 eac3_encoder_select="ac3_encoder"
-eamad_decoder_select="aandcttables blockdsp bswapdsp dsputil mpegvideo"
-eatgq_decoder_select="aandcttables dsputil"
-eatqi_decoder_select="aandcttables blockdsp bswapdsp dsputil mpeg1video_decoder"
+eamad_decoder_select="aandcttables blockdsp bswapdsp idctdsp mpegvideo"
+eatgq_decoder_select="aandcttables idctdsp"
+eatqi_decoder_select="aandcttables blockdsp bswapdsp idctdsp mpeg1video_decoder"
 exr_decoder_deps="zlib"
 ffv1_decoder_select="golomb rangecoder"
 ffv1_encoder_select="rangecoder"
@@ -1770,7 +1772,7 @@ flv_encoder_select="h263_encoder"
 fourxm_decoder_select="blockdsp bswapdsp"
 fraps_decoder_select="bswapdsp huffman"
 g2m_decoder_deps="zlib"
-g2m_decoder_select="blockdsp dsputil"
+g2m_decoder_select="blockdsp idctdsp"
 h261_decoder_select="mpeg_er mpegvideo"
 h261_encoder_select="aandcttables mpegvideoenc"
 h263_decoder_select="error_resilience h263_parser h263dsp mpeg_er mpegvideo qpeldsp"
@@ -1790,12 +1792,12 @@ jpegls_decoder_select="golomb mjpeg_decoder"
 jpegls_encoder_select="golomb"
 jv_decoder_select="blockdsp"
 lagarith_decoder_select="huffyuvdsp"
-ljpeg_encoder_select="aandcttables dsputil"
+ljpeg_encoder_select="aandcttables idctdsp"
 loco_decoder_select="golomb"
-mdec_decoder_select="blockdsp dsputil mpegvideo"
+mdec_decoder_select="blockdsp idctdsp mpegvideo"
 metasound_decoder_select="lsp mdct sinewin"
-mimic_decoder_select="blockdsp bswapdsp dsputil hpeldsp"
-mjpeg_decoder_select="blockdsp dsputil hpeldsp"
+mimic_decoder_select="blockdsp bswapdsp hpeldsp idctdsp"
+mjpeg_decoder_select="blockdsp hpeldsp idctdsp"
 mjpeg_encoder_select="aandcttables mpegvideoenc"
 mjpegb_decoder_select="mjpeg_decoder"
 mlp_decoder_select="mlp_parser"
@@ -1829,13 +1831,13 @@ mss2_decoder_select="error_resilience mpeg_er qpeldsp vc1_decoder"
 mxpeg_decoder_select="mjpeg_decoder"
 nellymoser_decoder_select="mdct sinewin"
 nellymoser_encoder_select="audio_frame_queue mdct sinewin"
-nuv_decoder_select="dsputil lzo"
+nuv_decoder_select="idctdsp lzo"
 on2avc_decoder_select="mdct"
 opus_decoder_deps="avresample"
 png_decoder_deps="zlib"
 png_encoder_deps="zlib"
 png_encoder_select="huffyuvencdsp"
-prores_decoder_select="dsputil"
+prores_decoder_select="idctdsp"
 prores_encoder_select="dsputil"
 qcelp_decoder_select="lsp"
 qdm2_decoder_select="mdct rdft mpegaudiodsp"
@@ -1888,7 +1890,7 @@ wmav2_encoder_select="mdct sinewin"
 wmavoice_decoder_select="lsp rdft dct mdct sinewin"
 wmv1_decoder_select="h263_decoder"
 wmv1_encoder_select="h263_encoder"
-wmv2_decoder_select="blockdsp h263_decoder intrax8 videodsp"
+wmv2_decoder_select="blockdsp h263_decoder idctdsp intrax8 videodsp"
 wmv2_encoder_select="h263_encoder"
 wmv3_decoder_select="vc1_decoder"
 wmv3image_decoder_select="wmv3_decoder"
diff --git a/doc/optimization.txt b/doc/optimization.txt
index b51183fa34..b3dca645a8 100644
--- a/doc/optimization.txt
+++ b/doc/optimization.txt
@@ -136,9 +136,6 @@ dct_unquantize_mpeg2
 dct_unquantize_h263
     Used in MPEG-4/H.263 en/decoding.
 
-FIXME remaining functions?
-BTW, most of these functions are in dsputil.c/.h, some are in mpegvideo.c/.h.
-
 
 
 Alignment:
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bfe50f352e..dc374cb605 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -33,9 +33,8 @@ OBJS-$(CONFIG_BLOCKDSP)                += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP)                += bswapdsp.o
 OBJS-$(CONFIG_CABAC)                   += cabac.o
 OBJS-$(CONFIG_DCT)                     += dct.o dct32_fixed.o dct32_float.o
+OBJS-$(CONFIG_DSPUTIL)                 += dsputil.o
 OBJS-$(CONFIG_DXVA2)                   += dxva2.o
-OBJS-$(CONFIG_DSPUTIL)                 += dsputil.o faanidct.o          \
-                                          simple_idct.o jrevdct.o
 OBJS-$(CONFIG_ENCODERS)                += faandct.o jfdctfst.o jfdctint.o
 OBJS-$(CONFIG_ERROR_RESILIENCE)        += error_resilience.o
 FFT-OBJS-$(CONFIG_HARDCODED_TABLES)    += cos_tables.o cos_fixed_tables.o
@@ -51,6 +50,8 @@ OBJS-$(CONFIG_HPELDSP)                 += hpeldsp.o
 OBJS-$(CONFIG_HUFFMAN)                 += huffman.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += huffyuvdsp.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += huffyuvencdsp.o
+OBJS-$(CONFIG_IDCTDSP)                 += idctdsp.o faanidct.o          \
+                                          simple_idct.o jrevdct.o
 OBJS-$(CONFIG_INTRAX8)                 += intrax8.o intrax8dsp.o
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
 OBJS-$(CONFIG_LPC)                     += lpc.o
diff --git a/libavcodec/aic.c b/libavcodec/aic.c
index 68ae728763..dac9d8b7fd 100644
--- a/libavcodec/aic.c
+++ b/libavcodec/aic.c
@@ -24,10 +24,10 @@
 
 #include "avcodec.h"
 #include "bytestream.h"
-#include "dsputil.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "golomb.h"
+#include "idctdsp.h"
 #include "unary.h"
 
 #define AIC_HDR_SIZE    24
@@ -139,7 +139,7 @@ static const uint8_t *aic_scan[NUM_BANDS] = {
 typedef struct AICContext {
     AVCodecContext *avctx;
     AVFrame        *frame;
-    DSPContext     dsp;
+    IDCTDSPContext idsp;
     ScanTable      scantable;
 
     int            num_x_slices;
@@ -336,16 +336,15 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
                 recombine_block_il(ctx->block, ctx->scantable.permutated,
                                    &base_y, &ext_y, blk);
             unquant_block(ctx->block, ctx->quant);
-            ctx->dsp.idct(ctx->block);
+            ctx->idsp.idct(ctx->block);
 
             if (!ctx->interlaced) {
                 dst = Y + (blk >> 1) * 8 * ystride + (blk & 1) * 8;
-                ctx->dsp.put_signed_pixels_clamped(ctx->block, dst,
-                                                   ystride);
+                ctx->idsp.put_signed_pixels_clamped(ctx->block, dst, ystride);
             } else {
                 dst = Y + (blk & 1) * 8 + (blk >> 1) * ystride;
-                ctx->dsp.put_signed_pixels_clamped(ctx->block, dst,
-                                                   ystride * 2);
+                ctx->idsp.put_signed_pixels_clamped(ctx->block, dst,
+                                                    ystride * 2);
             }
         }
         Y += 16;
@@ -354,9 +353,9 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y,
             recombine_block(ctx->block, ctx->scantable.permutated,
                             &base_c, &ext_c);
             unquant_block(ctx->block, ctx->quant);
-            ctx->dsp.idct(ctx->block);
-            ctx->dsp.put_signed_pixels_clamped(ctx->block, C[blk],
-                                               ctx->frame->linesize[blk + 1]);
+            ctx->idsp.idct(ctx->block);
+            ctx->idsp.put_signed_pixels_clamped(ctx->block, C[blk],
+                                                ctx->frame->linesize[blk + 1]);
             C[blk] += 8;
         }
     }
@@ -426,11 +425,11 @@ static av_cold int aic_decode_init(AVCodecContext *avctx)
 
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
-    ff_dsputil_init(&ctx->dsp, avctx);
+    ff_idctdsp_init(&ctx->idsp, avctx);
 
     for (i = 0; i < 64; i++)
         scan[i] = i;
-    ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, scan);
+    ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, scan);
 
     ctx->mb_width  = FFALIGN(avctx->width,  16) >> 4;
     ctx->mb_height = FFALIGN(avctx->height, 16) >> 4;
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index eb92a8c953..3a3e244c4d 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -6,10 +6,7 @@ OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                           arm/ac3dsp_arm.o
 OBJS-$(CONFIG_AUDIODSP)                += arm/audiodsp_init_arm.o
 OBJS-$(CONFIG_BLOCKDSP)                += arm/blockdsp_init_arm.o
-OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o        \
-                                          arm/dsputil_arm.o             \
-                                          arm/jrevdct_arm.o             \
-                                          arm/simple_idct_arm.o
+OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o
 OBJS-$(CONFIG_FFT)                     += arm/fft_init_arm.o            \
                                           arm/fft_fixed_init_arm.o
 OBJS-$(CONFIG_H264CHROMA)              += arm/h264chroma_init_arm.o
@@ -18,6 +15,10 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS-$(CONFIG_H264QPEL)                += arm/h264qpel_init_arm.o
 OBJS-$(CONFIG_HPELDSP)                 += arm/hpeldsp_init_arm.o        \
                                           arm/hpeldsp_arm.o
+OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
+                                          arm/idctdsp_arm.o             \
+                                          arm/jrevdct_arm.o             \
+                                          arm/simple_idct_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
@@ -40,7 +41,7 @@ OBJS-$(CONFIG_RV30_DECODER)            += arm/rv34dsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv34dsp_init_arm.o        \
                                           arm/rv40dsp_init_arm.o
 
-ARMV5TE-OBJS-$(CONFIG_DSPUTIL)         += arm/dsputil_init_armv5te.o    \
+ARMV5TE-OBJS-$(CONFIG_IDCTDSP)         += arm/idctdsp_init_armv5te.o    \
                                           arm/simple_idct_armv5te.o
 ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO)       += arm/mpegvideo_armv5te.o       \
                                           arm/mpegvideo_armv5te_s.o
@@ -51,11 +52,13 @@ ARMV5TE-OBJS-$(CONFIG_MLP_DECODER)     += arm/mlpdsp_armv5te.o
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_init_armv6.o      \
-                                          arm/dsputil_armv6.o           \
-                                          arm/simple_idct_armv6.o
+                                          arm/dsputil_armv6.o
 ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                           arm/hpeldsp_armv6.o
+ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
+                                          arm/idctdsp_armv6.o           \
+                                          arm/simple_idct_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
 
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
@@ -83,9 +86,6 @@ NEON-OBJS-$(CONFIG_AUDIODSP)           += arm/audiodsp_init_neon.o      \
                                           arm/int_neon.o
 NEON-OBJS-$(CONFIG_BLOCKDSP)           += arm/blockdsp_init_neon.o      \
                                           arm/blockdsp_neon.o
-NEON-OBJS-$(CONFIG_DSPUTIL)            += arm/dsputil_init_neon.o       \
-                                          arm/dsputil_neon.o            \
-                                          arm/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
                                           arm/fft_fixed_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)         += arm/h264cmc_neon.o
@@ -96,6 +96,9 @@ NEON-OBJS-$(CONFIG_H264QPEL)           += arm/h264qpel_neon.o           \
                                           arm/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP)            += arm/hpeldsp_init_neon.o       \
                                           arm/hpeldsp_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP)            += arm/idctdsp_init_neon.o       \
+                                          arm/idctdsp_neon.o            \
+                                          arm/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)               += arm/mdct_neon.o               \
                                           arm/mdct_fixed_neon.o
 NEON-OBJS-$(CONFIG_MPEGVIDEO)          += arm/mpegvideo_neon.o
diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S
deleted file mode 100644
index 82fcf2ae91..0000000000
--- a/libavcodec/arm/dsputil_arm.S
+++ /dev/null
@@ -1,120 +0,0 @@
-@
-@ ARMv4 optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of Libav.
-@
-@ Libav is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ Libav is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
-function ff_add_pixels_clamped_arm, export=1, align=5
-        push            {r4-r10}
-        mov             r10, #8
-1:
-        ldr             r4,  [r1]               /* load dest */
-        /* block[0] and block[1]*/
-        ldrsh           r5,  [r0]
-        ldrsh           r7,  [r0, #2]
-        and             r6,  r4,  #0xFF
-        and             r8,  r4,  #0xFF00
-        add             r6,  r6,  r5
-        add             r8,  r7,  r8,  lsr #8
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        it              ne
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        it              ne
-        movne           r8,  r7,  lsr #24
-        mov             r9,  r6
-        ldrsh           r5,  [r0, #4]           /* moved form [A] */
-        orr             r9,  r9,  r8,  lsl #8
-        /* block[2] and block[3] */
-        /* [A] */
-        ldrsh           r7,  [r0, #6]
-        and             r6,  r4,  #0xFF0000
-        and             r8,  r4,  #0xFF000000
-        add             r6,  r5,  r6,  lsr #16
-        add             r8,  r7,  r8,  lsr #24
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        it              ne
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        it              ne
-        movne           r8,  r7,  lsr #24
-        orr             r9,  r9,  r6,  lsl #16
-        ldr             r4,  [r1, #4]           /* moved form [B] */
-        orr             r9,  r9,  r8,  lsl #24
-        /* store dest */
-        ldrsh           r5,  [r0, #8]           /* moved form [C] */
-        str             r9,  [r1]
-
-        /* load dest */
-        /* [B] */
-        /* block[4] and block[5] */
-        /* [C] */
-        ldrsh           r7,  [r0, #10]
-        and             r6,  r4,  #0xFF
-        and             r8,  r4,  #0xFF00
-        add             r6,  r6,  r5
-        add             r8,  r7,  r8,  lsr #8
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        it              ne
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        it              ne
-        movne           r8,  r7,  lsr #24
-        mov             r9,  r6
-        ldrsh           r5,  [r0, #12]          /* moved from [D] */
-        orr             r9,  r9,  r8,  lsl #8
-        /* block[6] and block[7] */
-        /* [D] */
-        ldrsh           r7,  [r0, #14]
-        and             r6,  r4,  #0xFF0000
-        and             r8,  r4,  #0xFF000000
-        add             r6,  r5,  r6,  lsr #16
-        add             r8,  r7,  r8,  lsr #24
-        mvn             r5,  r5
-        mvn             r7,  r7
-        tst             r6,  #0x100
-        it              ne
-        movne           r6,  r5,  lsr #24
-        tst             r8,  #0x100
-        it              ne
-        movne           r8,  r7,  lsr #24
-        orr             r9,  r9,  r6,  lsl #16
-        add             r0,  r0,  #16           /* moved from [E] */
-        orr             r9,  r9,  r8,  lsl #24
-        subs            r10, r10, #1            /* moved from [F] */
-        /* store dest */
-        str             r9,  [r1, #4]
-
-        /* [E] */
-        /* [F] */
-        add             r1,  r1,  r2
-        bne             1b
-
-        pop             {r4-r10}
-        bx              lr
-endfunc
diff --git a/libavcodec/arm/dsputil_arm.h b/libavcodec/arm/dsputil_arm.h
index 6080203960..5b976aa3d6 100644
--- a/libavcodec/arm/dsputil_arm.h
+++ b/libavcodec/arm/dsputil_arm.h
@@ -24,11 +24,7 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 
-void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx,
-                             unsigned high_bit_depth);
 void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                            unsigned high_bit_depth);
-void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
-                          unsigned high_bit_depth);
 
 #endif /* AVCODEC_ARM_DSPUTIL_ARM_H */
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
index e667a47f94..b89171ff94 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -20,33 +20,6 @@
 
 #include "libavutil/arm/asm.S"
 
-function ff_add_pixels_clamped_armv6, export=1
-        push            {r4-r8,lr}
-        mov             r3,  #8
-1:
-        ldm             r0!, {r4,r5,r12,lr}
-        ldrd            r6,  r7,  [r1]
-        pkhbt           r8,  r4,  r5,  lsl #16
-        pkhtb           r5,  r5,  r4,  asr #16
-        pkhbt           r4,  r12, lr,  lsl #16
-        pkhtb           lr,  lr,  r12, asr #16
-        pld             [r1, r2]
-        uxtab16         r8,  r8,  r6
-        uxtab16         r5,  r5,  r6,  ror #8
-        uxtab16         r4,  r4,  r7
-        uxtab16         lr,  lr,  r7,  ror #8
-        usat16          r8,  #8,  r8
-        usat16          r5,  #8,  r5
-        usat16          r4,  #8,  r4
-        usat16          lr,  #8,  lr
-        orr             r6,  r8,  r5,  lsl #8
-        orr             r7,  r4,  lr,  lsl #8
-        subs            r3,  r3,  #1
-        strd_post       r6,  r7,  r1,  r2
-        bgt             1b
-        pop             {r4-r8,pc}
-endfunc
-
 function ff_get_pixels_armv6, export=1
         pld             [r1, r2]
         push            {r4-r8, lr}
diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c
index 33109088ee..a8c806a47f 100644
--- a/libavcodec/arm/dsputil_init_arm.c
+++ b/libavcodec/arm/dsputil_init_arm.c
@@ -28,71 +28,11 @@
 #include "libavcodec/dsputil.h"
 #include "dsputil_arm.h"
 
-void ff_j_rev_dct_arm(int16_t *data);
-void ff_simple_idct_arm(int16_t *data);
-
-/* XXX: local hack */
-static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-
-void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
-                               int line_size);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
- * converted */
-static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_j_rev_dct_arm(block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
-
-static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_j_rev_dct_arm(block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
-
-static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_simple_idct_arm(block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
-
-static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_simple_idct_arm(block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
-
 av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    ff_put_pixels_clamped = c->put_pixels_clamped;
-    ff_add_pixels_clamped = c->add_pixels_clamped;
-
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
-            avctx->idct_algo == FF_IDCT_ARM) {
-            c->idct_put              = j_rev_dct_arm_put;
-            c->idct_add              = j_rev_dct_arm_add;
-            c->idct                  = ff_j_rev_dct_arm;
-            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
-        } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
-            c->idct_put              = simple_idct_arm_put;
-            c->idct_add              = simple_idct_arm_add;
-            c->idct                  = ff_simple_idct_arm;
-            c->idct_permutation_type = FF_NO_IDCT_PERM;
-        }
-    }
-
-    c->add_pixels_clamped = ff_add_pixels_clamped_arm;
-
-    if (have_armv5te(cpu_flags))
-        ff_dsputil_init_armv5te(c, avctx, high_bit_depth);
     if (have_armv6(cpu_flags))
         ff_dsputil_init_armv6(c, avctx, high_bit_depth);
-    if (have_neon(cpu_flags))
-        ff_dsputil_init_neon(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/arm/dsputil_init_armv5te.c b/libavcodec/arm/dsputil_init_armv5te.c
deleted file mode 100644
index eb45b72088..0000000000
--- a/libavcodec/arm/dsputil_init_armv5te.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_armv5te(int16_t *data);
-void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
-
-av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx,
-                                     unsigned high_bit_depth)
-{
-    if (!high_bit_depth &&
-        (avctx->idct_algo == FF_IDCT_AUTO ||
-         avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
-        c->idct_put              = ff_simple_idct_put_armv5te;
-        c->idct_add              = ff_simple_idct_add_armv5te;
-        c->idct                  = ff_simple_idct_armv5te;
-        c->idct_permutation_type = FF_NO_IDCT_PERM;
-    }
-}
diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c
index 2b1002bfda..fab5e0d232 100644
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -26,13 +26,6 @@
 #include "libavcodec/mpegvideo.h"
 #include "dsputil_arm.h"
 
-void ff_simple_idct_armv6(int16_t *data);
-void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
-
-void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
-                                 int line_size);
-
 void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
 void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
                           const uint8_t *s2, int stride);
@@ -56,17 +49,6 @@ int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
-            avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
-            c->idct_put              = ff_simple_idct_put_armv6;
-            c->idct_add              = ff_simple_idct_add_armv6;
-            c->idct                  = ff_simple_idct_armv6;
-            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
-        }
-    }
-    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
-
     if (!high_bit_depth)
         c->get_pixels = ff_get_pixels_armv6;
     c->diff_pixels = ff_diff_pixels_armv6;
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
deleted file mode 100644
index 9d4c76ce58..0000000000
--- a/libavcodec/arm/dsputil_init_neon.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_neon(int16_t *data);
-void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
-
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-
-av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
-                                  unsigned high_bit_depth)
-{
-    if (!high_bit_depth) {
-        if (avctx->idct_algo == FF_IDCT_AUTO ||
-            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
-            c->idct_put              = ff_simple_idct_put_neon;
-            c->idct_add              = ff_simple_idct_add_neon;
-            c->idct                  = ff_simple_idct_neon;
-            c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
-        }
-    }
-
-    c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
-    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
-    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
-}
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
deleted file mode 100644
index ed6f218380..0000000000
--- a/libavcodec/arm/dsputil_neon.S
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_put_pixels_clamped_neon, export=1
-        vld1.16         {d16-d19}, [r0,:128]!
-        vqmovun.s16     d0, q8
-        vld1.16         {d20-d23}, [r0,:128]!
-        vqmovun.s16     d1, q9
-        vld1.16         {d24-d27}, [r0,:128]!
-        vqmovun.s16     d2, q10
-        vld1.16         {d28-d31}, [r0,:128]!
-        vqmovun.s16     d3, q11
-        vst1.8          {d0},      [r1,:64], r2
-        vqmovun.s16     d4, q12
-        vst1.8          {d1},      [r1,:64], r2
-        vqmovun.s16     d5, q13
-        vst1.8          {d2},      [r1,:64], r2
-        vqmovun.s16     d6, q14
-        vst1.8          {d3},      [r1,:64], r2
-        vqmovun.s16     d7, q15
-        vst1.8          {d4},      [r1,:64], r2
-        vst1.8          {d5},      [r1,:64], r2
-        vst1.8          {d6},      [r1,:64], r2
-        vst1.8          {d7},      [r1,:64], r2
-        bx              lr
-endfunc
-
-function ff_put_signed_pixels_clamped_neon, export=1
-        vmov.u8         d31, #128
-        vld1.16         {d16-d17}, [r0,:128]!
-        vqmovn.s16      d0, q8
-        vld1.16         {d18-d19}, [r0,:128]!
-        vqmovn.s16      d1, q9
-        vld1.16         {d16-d17}, [r0,:128]!
-        vqmovn.s16      d2, q8
-        vld1.16         {d18-d19}, [r0,:128]!
-        vadd.u8         d0, d0, d31
-        vld1.16         {d20-d21}, [r0,:128]!
-        vadd.u8         d1, d1, d31
-        vld1.16         {d22-d23}, [r0,:128]!
-        vadd.u8         d2, d2, d31
-        vst1.8          {d0},      [r1,:64], r2
-        vqmovn.s16      d3, q9
-        vst1.8          {d1},      [r1,:64], r2
-        vqmovn.s16      d4, q10
-        vst1.8          {d2},      [r1,:64], r2
-        vqmovn.s16      d5, q11
-        vld1.16         {d24-d25}, [r0,:128]!
-        vadd.u8         d3, d3, d31
-        vld1.16         {d26-d27}, [r0,:128]!
-        vadd.u8         d4, d4, d31
-        vadd.u8         d5, d5, d31
-        vst1.8          {d3},      [r1,:64], r2
-        vqmovn.s16      d6, q12
-        vst1.8          {d4},      [r1,:64], r2
-        vqmovn.s16      d7, q13
-        vst1.8          {d5},      [r1,:64], r2
-        vadd.u8         d6, d6, d31
-        vadd.u8         d7, d7, d31
-        vst1.8          {d6},      [r1,:64], r2
-        vst1.8          {d7},      [r1,:64], r2
-        bx              lr
-endfunc
-
-function ff_add_pixels_clamped_neon, export=1
-        mov             r3, r1
-        vld1.8          {d16},   [r1,:64], r2
-        vld1.16         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vld1.8          {d17},   [r1,:64], r2
-        vld1.16         {d2-d3}, [r0,:128]!
-        vqmovun.s16     d0, q0
-        vld1.8          {d18},   [r1,:64], r2
-        vaddw.u8        q1, q1, d17
-        vld1.16         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.8          {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.8          {d19},   [r1,:64], r2
-        vld1.16         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vqmovun.s16     d4, q2
-        vst1.8          {d2},    [r3,:64], r2
-        vld1.8          {d16},   [r1,:64], r2
-        vqmovun.s16     d6, q3
-        vld1.16         {d0-d1}, [r0,:128]!
-        vaddw.u8        q0, q0, d16
-        vst1.8          {d4},    [r3,:64], r2
-        vld1.8          {d17},   [r1,:64], r2
-        vld1.16         {d2-d3}, [r0,:128]!
-        vaddw.u8        q1, q1, d17
-        vst1.8          {d6},    [r3,:64], r2
-        vqmovun.s16     d0, q0
-        vld1.8          {d18},   [r1,:64], r2
-        vld1.16         {d4-d5}, [r0,:128]!
-        vaddw.u8        q2, q2, d18
-        vst1.8          {d0},    [r3,:64], r2
-        vqmovun.s16     d2, q1
-        vld1.8          {d19},   [r1,:64], r2
-        vqmovun.s16     d4, q2
-        vld1.16         {d6-d7}, [r0,:128]!
-        vaddw.u8        q3, q3, d19
-        vst1.8          {d2},    [r3,:64], r2
-        vqmovun.s16     d6, q3
-        vst1.8          {d4},    [r3,:64], r2
-        vst1.8          {d6},    [r3,:64], r2
-        bx              lr
-endfunc
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
new file mode 100644
index 0000000000..34f467e86f
--- /dev/null
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -0,0 +1,120 @@
+@
+@ ARMv4-optimized IDCT functions
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of Libav.
+@
+@ Libav is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ Libav is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with Libav; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+function ff_add_pixels_clamped_arm, export=1, align=5
+        push            {r4-r10}
+        mov             r10, #8
+1:
+        ldr             r4,  [r1]               /* load dest */
+        /* block[0] and block[1] */
+        ldrsh           r5,  [r0]
+        ldrsh           r7,  [r0, #2]
+        and             r6,  r4,  #0xFF
+        and             r8,  r4,  #0xFF00
+        add             r6,  r6,  r5
+        add             r8,  r7,  r8,  lsr #8
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        it              ne
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        it              ne
+        movne           r8,  r7,  lsr #24
+        mov             r9,  r6
+        ldrsh           r5,  [r0, #4]           /* moved from [A] */
+        orr             r9,  r9,  r8,  lsl #8
+        /* block[2] and block[3] */
+        /* [A] */
+        ldrsh           r7,  [r0, #6]
+        and             r6,  r4,  #0xFF0000
+        and             r8,  r4,  #0xFF000000
+        add             r6,  r5,  r6,  lsr #16
+        add             r8,  r7,  r8,  lsr #24
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        it              ne
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        it              ne
+        movne           r8,  r7,  lsr #24
+        orr             r9,  r9,  r6,  lsl #16
+        ldr             r4,  [r1, #4]           /* moved from [B] */
+        orr             r9,  r9,  r8,  lsl #24
+        /* store dest */
+        ldrsh           r5,  [r0, #8]           /* moved from [C] */
+        str             r9,  [r1]
+
+        /* load dest */
+        /* [B] */
+        /* block[4] and block[5] */
+        /* [C] */
+        ldrsh           r7,  [r0, #10]
+        and             r6,  r4,  #0xFF
+        and             r8,  r4,  #0xFF00
+        add             r6,  r6,  r5
+        add             r8,  r7,  r8,  lsr #8
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        it              ne
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        it              ne
+        movne           r8,  r7,  lsr #24
+        mov             r9,  r6
+        ldrsh           r5,  [r0, #12]          /* moved from [D] */
+        orr             r9,  r9,  r8,  lsl #8
+        /* block[6] and block[7] */
+        /* [D] */
+        ldrsh           r7,  [r0, #14]
+        and             r6,  r4,  #0xFF0000
+        and             r8,  r4,  #0xFF000000
+        add             r6,  r5,  r6,  lsr #16
+        add             r8,  r7,  r8,  lsr #24
+        mvn             r5,  r5
+        mvn             r7,  r7
+        tst             r6,  #0x100
+        it              ne
+        movne           r6,  r5,  lsr #24
+        tst             r8,  #0x100
+        it              ne
+        movne           r8,  r7,  lsr #24
+        orr             r9,  r9,  r6,  lsl #16
+        add             r0,  r0,  #16           /* moved from [E] */
+        orr             r9,  r9,  r8,  lsl #24
+        subs            r10, r10, #1            /* moved from [F] */
+        /* store dest */
+        str             r9,  [r1, #4]
+
+        /* [E] */
+        /* [F] */
+        add             r1,  r1,  r2
+        bne             1b
+
+        pop             {r4-r10}
+        bx              lr
+endfunc
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
new file mode 100644
index 0000000000..9012b82904
--- /dev/null
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_IDCTDSP_ARM_H
+#define AVCODEC_ARM_IDCTDSP_ARM_H
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+
+void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
+                             unsigned high_bit_depth);
+void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
+                           unsigned high_bit_depth);
+void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
+                          unsigned high_bit_depth);
+
+#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
new file mode 100644
index 0000000000..c180d732fa
--- /dev/null
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_add_pixels_clamped_armv6, export=1
+        push            {r4-r8,lr}
+        mov             r3,  #8
+1:
+        ldm             r0!, {r4,r5,r12,lr}
+        ldrd            r6,  r7,  [r1]
+        pkhbt           r8,  r4,  r5,  lsl #16
+        pkhtb           r5,  r5,  r4,  asr #16
+        pkhbt           r4,  r12, lr,  lsl #16
+        pkhtb           lr,  lr,  r12, asr #16
+        pld             [r1, r2]
+        uxtab16         r8,  r8,  r6
+        uxtab16         r5,  r5,  r6,  ror #8
+        uxtab16         r4,  r4,  r7
+        uxtab16         lr,  lr,  r7,  ror #8
+        usat16          r8,  #8,  r8
+        usat16          r5,  #8,  r5
+        usat16          r4,  #8,  r4
+        usat16          lr,  #8,  lr
+        orr             r6,  r8,  r5,  lsl #8
+        orr             r7,  r4,  lr,  lsl #8
+        subs            r3,  r3,  #1
+        strd_post       r6,  r7,  r1,  r2
+        bgt             1b
+        pop             {r4-r8,pc}
+endfunc
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
new file mode 100644
index 0000000000..b4d189902d
--- /dev/null
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -0,0 +1,98 @@
+/*
+ * ARM-optimized IDCT functions
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_arm.h"
+
+void ff_j_rev_dct_arm(int16_t *data);
+void ff_simple_idct_arm(int16_t *data);
+
+/* XXX: local hack: clamp routines saved from the context at init time */
+static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+
+void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
+                               int line_size);
+
+/* XXX: these functions should be removed as soon as all IDCTs are
+ * converted */
+static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct_arm(block);
+    ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct_arm(block);
+    ff_add_pixels_clamped(block, dest, line_size);
+}
+
+static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_simple_idct_arm(block);
+    ff_put_pixels_clamped(block, dest, line_size);
+}
+
+static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_simple_idct_arm(block);
+    ff_add_pixels_clamped(block, dest, line_size);
+}
+
+av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    ff_put_pixels_clamped = c->put_pixels_clamped;
+    ff_add_pixels_clamped = c->add_pixels_clamped;
+
+    if (!high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_ARM) {
+            c->idct_put              = j_rev_dct_arm_put;
+            c->idct_add              = j_rev_dct_arm_add;
+            c->idct                  = ff_j_rev_dct_arm;
+            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+        } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
+            c->idct_put              = simple_idct_arm_put;
+            c->idct_add              = simple_idct_arm_add;
+            c->idct                  = ff_simple_idct_arm;
+            c->idct_permutation_type = FF_NO_IDCT_PERM;
+        }
+    }
+
+    c->add_pixels_clamped = ff_add_pixels_clamped_arm;
+
+    if (have_armv5te(cpu_flags))
+        ff_idctdsp_init_armv5te(c, avctx, high_bit_depth);
+    if (have_armv6(cpu_flags))
+        ff_idctdsp_init_armv6(c, avctx, high_bit_depth);
+    if (have_neon(cpu_flags))
+        ff_idctdsp_init_neon(c, avctx, high_bit_depth);
+}
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
new file mode 100644
index 0000000000..e2492a5da7
--- /dev/null
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_arm.h"
+
+void ff_simple_idct_armv5te(int16_t *data);
+void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
+
+av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if (!high_bit_depth &&
+        (avctx->idct_algo == FF_IDCT_AUTO ||
+         avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
+        c->idct_put              = ff_simple_idct_put_armv5te;
+        c->idct_add              = ff_simple_idct_add_armv5te;
+        c->idct                  = ff_simple_idct_armv5te;
+        c->idct_permutation_type = FF_NO_IDCT_PERM;
+    }
+}
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
new file mode 100644
index 0000000000..e92f471220
--- /dev/null
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_arm.h"
+
+void ff_simple_idct_armv6(int16_t *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
+
+void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
+                                 int line_size);
+
+av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
+                                   unsigned high_bit_depth)
+{
+    if (!high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
+            c->idct_put              = ff_simple_idct_put_armv6;
+            c->idct_add              = ff_simple_idct_add_armv6;
+            c->idct                  = ff_simple_idct_armv6;
+            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+        }
+    }
+    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
+}
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
new file mode 100644
index 0000000000..17905973fb
--- /dev/null
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -0,0 +1,53 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idctdsp_arm.h"
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+
+av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
+                                  unsigned high_bit_depth)
+{
+    if (!high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+            c->idct_put              = ff_simple_idct_put_neon;
+            c->idct_add              = ff_simple_idct_add_neon;
+            c->idct                  = ff_simple_idct_neon;
+            c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+        }
+    }
+
+    c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
+    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
+    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+}
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
new file mode 100644
index 0000000000..7095879bae
--- /dev/null
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -0,0 +1,128 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_put_pixels_clamped_neon, export=1
+        vld1.16         {d16-d19}, [r0,:128]!
+        vqmovun.s16     d0, q8
+        vld1.16         {d20-d23}, [r0,:128]!
+        vqmovun.s16     d1, q9
+        vld1.16         {d24-d27}, [r0,:128]!
+        vqmovun.s16     d2, q10
+        vld1.16         {d28-d31}, [r0,:128]!
+        vqmovun.s16     d3, q11
+        vst1.8          {d0},      [r1,:64], r2
+        vqmovun.s16     d4, q12
+        vst1.8          {d1},      [r1,:64], r2
+        vqmovun.s16     d5, q13
+        vst1.8          {d2},      [r1,:64], r2
+        vqmovun.s16     d6, q14
+        vst1.8          {d3},      [r1,:64], r2
+        vqmovun.s16     d7, q15
+        vst1.8          {d4},      [r1,:64], r2
+        vst1.8          {d5},      [r1,:64], r2
+        vst1.8          {d6},      [r1,:64], r2
+        vst1.8          {d7},      [r1,:64], r2
+        bx              lr
+endfunc
+
+function ff_put_signed_pixels_clamped_neon, export=1
+        vmov.u8         d31, #128
+        vld1.16         {d16-d17}, [r0,:128]!
+        vqmovn.s16      d0, q8
+        vld1.16         {d18-d19}, [r0,:128]!
+        vqmovn.s16      d1, q9
+        vld1.16         {d16-d17}, [r0,:128]!
+        vqmovn.s16      d2, q8
+        vld1.16         {d18-d19}, [r0,:128]!
+        vadd.u8         d0, d0, d31
+        vld1.16         {d20-d21}, [r0,:128]!
+        vadd.u8         d1, d1, d31
+        vld1.16         {d22-d23}, [r0,:128]!
+        vadd.u8         d2, d2, d31
+        vst1.8          {d0},      [r1,:64], r2
+        vqmovn.s16      d3, q9
+        vst1.8          {d1},      [r1,:64], r2
+        vqmovn.s16      d4, q10
+        vst1.8          {d2},      [r1,:64], r2
+        vqmovn.s16      d5, q11
+        vld1.16         {d24-d25}, [r0,:128]!
+        vadd.u8         d3, d3, d31
+        vld1.16         {d26-d27}, [r0,:128]!
+        vadd.u8         d4, d4, d31
+        vadd.u8         d5, d5, d31
+        vst1.8          {d3},      [r1,:64], r2
+        vqmovn.s16      d6, q12
+        vst1.8          {d4},      [r1,:64], r2
+        vqmovn.s16      d7, q13
+        vst1.8          {d5},      [r1,:64], r2
+        vadd.u8         d6, d6, d31
+        vadd.u8         d7, d7, d31
+        vst1.8          {d6},      [r1,:64], r2
+        vst1.8          {d7},      [r1,:64], r2
+        bx              lr
+endfunc
+
+function ff_add_pixels_clamped_neon, export=1
+        mov             r3, r1
+        vld1.8          {d16},   [r1,:64], r2
+        vld1.16         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vld1.8          {d17},   [r1,:64], r2
+        vld1.16         {d2-d3}, [r0,:128]!
+        vqmovun.s16     d0, q0
+        vld1.8          {d18},   [r1,:64], r2
+        vaddw.u8        q1, q1, d17
+        vld1.16         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.8          {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.8          {d19},   [r1,:64], r2
+        vld1.16         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vqmovun.s16     d4, q2
+        vst1.8          {d2},    [r3,:64], r2
+        vld1.8          {d16},   [r1,:64], r2
+        vqmovun.s16     d6, q3
+        vld1.16         {d0-d1}, [r0,:128]!
+        vaddw.u8        q0, q0, d16
+        vst1.8          {d4},    [r3,:64], r2
+        vld1.8          {d17},   [r1,:64], r2
+        vld1.16         {d2-d3}, [r0,:128]!
+        vaddw.u8        q1, q1, d17
+        vst1.8          {d6},    [r3,:64], r2
+        vqmovun.s16     d0, q0
+        vld1.8          {d18},   [r1,:64], r2
+        vld1.16         {d4-d5}, [r0,:128]!
+        vaddw.u8        q2, q2, d18
+        vst1.8          {d0},    [r3,:64], r2
+        vqmovun.s16     d2, q1
+        vld1.8          {d19},   [r1,:64], r2
+        vqmovun.s16     d4, q2
+        vld1.16         {d6-d7}, [r0,:128]!
+        vaddw.u8        q3, q3, d19
+        vst1.8          {d2},    [r3,:64], r2
+        vqmovun.s16     d6, q3
+        vst1.8          {d4},    [r3,:64], r2
+        vst1.8          {d6},    [r3,:64], r2
+        bx              lr
+endfunc
diff --git a/libavcodec/asv.c b/libavcodec/asv.c
index dba9e840c7..71c5e5f5b8 100644
--- a/libavcodec/asv.c
+++ b/libavcodec/asv.c
@@ -84,7 +84,6 @@ av_cold void ff_asv_common_init(AVCodecContext *avctx) {
     ASV1Context * const a = avctx->priv_data;
 
     ff_bswapdsp_init(&a->bbdsp);
-    ff_dsputil_init(&a->dsp, avctx);
 
     a->mb_width   = (avctx->width  + 15) / 16;
     a->mb_height  = (avctx->height + 15) / 16;
diff --git a/libavcodec/asv.h b/libavcodec/asv.h
index 037e646969..3f8d56cf8b 100644
--- a/libavcodec/asv.h
+++ b/libavcodec/asv.h
@@ -34,6 +34,7 @@
 #include "blockdsp.h"
 #include "bswapdsp.h"
 #include "dsputil.h"
+#include "idctdsp.h"
 #include "get_bits.h"
 #include "put_bits.h"
 
@@ -42,6 +43,7 @@ typedef struct ASV1Context{
     BlockDSPContext bdsp;
     BswapDSPContext bbdsp;
     DSPContext dsp;
+    IDCTDSPContext idsp;
     PutBitContext pb;
     GetBitContext gb;
     ScanTable scantable;
diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c
index c785d151ec..252f88ab6e 100644
--- a/libavcodec/asvdec.c
+++ b/libavcodec/asvdec.c
@@ -30,6 +30,7 @@
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "put_bits.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mpeg12data.h"
@@ -190,14 +191,14 @@ static inline void idct_put(ASV1Context *a, AVFrame *frame, int mb_x, int mb_y)
     uint8_t *dest_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8;
     uint8_t *dest_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8;
 
-    a->dsp.idct_put(dest_y                 , linesize, block[0]);
-    a->dsp.idct_put(dest_y              + 8, linesize, block[1]);
-    a->dsp.idct_put(dest_y + 8*linesize    , linesize, block[2]);
-    a->dsp.idct_put(dest_y + 8*linesize + 8, linesize, block[3]);
+    a->idsp.idct_put(dest_y,                    linesize, block[0]);
+    a->idsp.idct_put(dest_y + 8,                linesize, block[1]);
+    a->idsp.idct_put(dest_y + 8 * linesize,     linesize, block[2]);
+    a->idsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]);
 
     if (!(a->avctx->flags&CODEC_FLAG_GRAY)) {
-        a->dsp.idct_put(dest_cb, frame->linesize[1], block[4]);
-        a->dsp.idct_put(dest_cr, frame->linesize[2], block[5]);
+        a->idsp.idct_put(dest_cb, frame->linesize[1], block[4]);
+        a->idsp.idct_put(dest_cr, frame->linesize[2], block[5]);
     }
 }
 
@@ -283,8 +284,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
 
     ff_asv_common_init(avctx);
     ff_blockdsp_init(&a->bdsp, avctx);
+    ff_idctdsp_init(&a->idsp, avctx);
     init_vlcs(a);
-    ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_asv_scantab);
+    ff_init_scantable(a->idsp.idct_permutation, &a->scantable, ff_asv_scantab);
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
 
     a->inv_qscale = avctx->extradata[0];
diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c
index 47b766ac9e..e8c6d00197 100644
--- a/libavcodec/asvenc.c
+++ b/libavcodec/asvenc.c
@@ -247,6 +247,7 @@ static av_cold int encode_init(AVCodecContext *avctx){
     avctx->coded_frame->key_frame = 1;
 
     ff_asv_common_init(avctx);
+    ff_dsputil_init(&a->dsp, avctx);
 
     if(avctx->global_quality == 0) avctx->global_quality= 4*FF_QUALITY_SCALE;
 
diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c
index 21bc1edc23..2be50a7c4a 100644
--- a/libavcodec/cavs.c
+++ b/libavcodec/cavs.c
@@ -29,6 +29,7 @@
 #include "get_bits.h"
 #include "golomb.h"
 #include "h264chroma.h"
+#include "idctdsp.h"
 #include "mathops.h"
 #include "qpeldsp.h"
 #include "cavs.h"
@@ -760,13 +761,13 @@ av_cold int ff_cavs_init(AVCodecContext *avctx)
     AVSContext *h = avctx->priv_data;
 
     ff_blockdsp_init(&h->bdsp, avctx);
-    ff_dsputil_init(&h->dsp, avctx);
     ff_h264chroma_init(&h->h264chroma, 8);
+    ff_idctdsp_init(&h->idsp, avctx);
     ff_videodsp_init(&h->vdsp, 8);
     ff_cavsdsp_init(&h->cdsp, avctx);
-    ff_init_scantable_permutation(h->dsp.idct_permutation,
+    ff_init_scantable_permutation(h->idsp.idct_permutation,
                                   h->cdsp.idct_perm);
-    ff_init_scantable(h->dsp.idct_permutation, &h->scantable, ff_zigzag_direct);
+    ff_init_scantable(h->idsp.idct_permutation, &h->scantable, ff_zigzag_direct);
 
     h->avctx       = avctx;
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h
index c5a10b556b..cfae05576b 100644
--- a/libavcodec/cavs.h
+++ b/libavcodec/cavs.h
@@ -24,8 +24,8 @@
 
 #include "cavsdsp.h"
 #include "blockdsp.h"
-#include "dsputil.h"
 #include "h264chroma.h"
+#include "idctdsp.h"
 #include "get_bits.h"
 #include "videodsp.h"
 
@@ -162,9 +162,9 @@ typedef struct AVSFrame {
 
 typedef struct AVSContext {
     AVCodecContext *avctx;
-    DSPContext       dsp;
     BlockDSPContext bdsp;
     H264ChromaContext h264chroma;
+    IDCTDSPContext idsp;
     VideoDSPContext vdsp;
     CAVSDSPContext  cdsp;
     GetBitContext gb;
diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c
index 666dc7f03f..958e3c5ef2 100644
--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -24,7 +24,7 @@
 
 #include <stdio.h>
 
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "mathops.h"
 #include "cavsdsp.h"
 #include "libavutil/common.h"
diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c
index 3bd8ffecd6..ca67990156 100644
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -28,7 +28,7 @@
 #include "blockdsp.h"
 #include "get_bits.h"
 #include "dnxhddata.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "internal.h"
 
 typedef struct DNXHDContext {
@@ -42,7 +42,7 @@ typedef struct DNXHDContext {
     int cur_field;                      ///< current interlaced field
     VLC ac_vlc, dc_vlc, run_vlc;
     int last_dc[3];
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     DECLARE_ALIGNED(16, int16_t, blocks)[12][64];
     ScanTable scantable;
     const CIDEntry *cid_table;
@@ -95,7 +95,7 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
                  ctx->cid_table->run_bits, 1, 1,
                  ctx->cid_table->run_codes, 2, 2, 0);
 
-        ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable,
+        ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable,
                           ff_zigzag_direct);
         ctx->cid = cid;
     }
@@ -136,7 +136,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         ctx->avctx->bits_per_raw_sample = 10;
         if (ctx->bit_depth != 10) {
             ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
-            ff_dsputil_init(&ctx->dsp, ctx->avctx);
+            ff_idctdsp_init(&ctx->idsp, ctx->avctx);
             ctx->bit_depth = 10;
             ctx->decode_dct_block = dnxhd_decode_dct_block_10_444;
         }
@@ -146,7 +146,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         ctx->avctx->bits_per_raw_sample = 10;
         if (ctx->bit_depth != 10) {
             ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
-            ff_dsputil_init(&ctx->dsp, ctx->avctx);
+            ff_idctdsp_init(&ctx->idsp, ctx->avctx);
             ctx->bit_depth = 10;
             ctx->decode_dct_block = dnxhd_decode_dct_block_10;
         }
@@ -155,7 +155,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame,
         ctx->avctx->bits_per_raw_sample = 8;
         if (ctx->bit_depth != 8) {
             ff_blockdsp_init(&ctx->bdsp, ctx->avctx);
-            ff_dsputil_init(&ctx->dsp, ctx->avctx);
+            ff_idctdsp_init(&ctx->idsp, ctx->avctx);
             ctx->bit_depth = 8;
             ctx->decode_dct_block = dnxhd_decode_dct_block_8;
         }
@@ -340,34 +340,34 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame,
     dct_y_offset = dct_linesize_luma << 3;
     dct_x_offset = 8 << shift1;
     if (!ctx->is_444) {
-        ctx->dsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->dsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->dsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
-        ctx->dsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);
 
         if (!(ctx->avctx->flags & CODEC_FLAG_GRAY)) {
             dct_y_offset = dct_linesize_chroma << 3;
-            ctx->dsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
-            ctx->dsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->dsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
-            ctx->dsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
+            ctx->idsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
+            ctx->idsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
         }
     } else {
-        ctx->dsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
-        ctx->dsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
-        ctx->dsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[6]);
-        ctx->dsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]);
+        ctx->idsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
+        ctx->idsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[6]);
+        ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]);
 
         if (!(ctx->avctx->flags & CODEC_FLAG_GRAY)) {
             dct_y_offset = dct_linesize_chroma << 3;
-            ctx->dsp.idct_put(dest_u,                               dct_linesize_chroma, ctx->blocks[2]);
-            ctx->dsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, ctx->blocks[3]);
-            ctx->dsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, ctx->blocks[8]);
-            ctx->dsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]);
-            ctx->dsp.idct_put(dest_v,                               dct_linesize_chroma, ctx->blocks[4]);
-            ctx->dsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, ctx->blocks[5]);
-            ctx->dsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, ctx->blocks[10]);
-            ctx->dsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]);
+            ctx->idsp.idct_put(dest_u,                               dct_linesize_chroma, ctx->blocks[2]);
+            ctx->idsp.idct_put(dest_u + dct_x_offset,                dct_linesize_chroma, ctx->blocks[3]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset,                dct_linesize_chroma, ctx->blocks[8]);
+            ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]);
+            ctx->idsp.idct_put(dest_v,                               dct_linesize_chroma, ctx->blocks[4]);
+            ctx->idsp.idct_put(dest_v + dct_x_offset,                dct_linesize_chroma, ctx->blocks[5]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset,                dct_linesize_chroma, ctx->blocks[10]);
+            ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]);
         }
     }
 
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index b85027b6d2..223791acbf 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -200,14 +200,14 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
 
     if (ctx->cid_table->bit_depth == 8) {
         for (i = 1; i < 64; i++) {
-            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+            int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
             weight_matrix[j] = ctx->cid_table->luma_weight[i];
         }
         ff_convert_matrix(&ctx->m, ctx->qmatrix_l, ctx->qmatrix_l16,
                           weight_matrix, ctx->m.intra_quant_bias, 1,
                           ctx->m.avctx->qmax, 1);
         for (i = 1; i < 64; i++) {
-            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+            int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
             weight_matrix[j] = ctx->cid_table->chroma_weight[i];
         }
         ff_convert_matrix(&ctx->m, ctx->qmatrix_c, ctx->qmatrix_c16,
@@ -228,7 +228,7 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
         // 10-bit
         for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
             for (i = 1; i < 64; i++) {
-                int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
+                int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]];
 
                 /* The quantization formula from the VC-3 standard is:
                  * quantized = sign(block[i]) * floor(abs(block[i]/s) * p /
@@ -308,6 +308,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
 
     ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_dsputil_init(&ctx->m.dsp, avctx);
+    ff_idctdsp_init(&ctx->m.idsp, avctx);
     ff_dct_common_init(&ctx->m);
     if (!ctx->m.dct_quantize)
         ctx->m.dct_quantize = ff_dct_quantize_c;
@@ -634,7 +635,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg,
 
             if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) {
                 dnxhd_unquantize_c(ctx, block, i, qscale, last_index);
-                ctx->m.dsp.idct(block);
+                ctx->m.idsp.idct(block);
                 ssd += dnxhd_ssd_block(block, src_block);
             }
         }
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index ca0c8ef622..5e5ad93956 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -33,7 +33,6 @@
 #include "dsputil.h"
 #include "simple_idct.h"
 #include "faandct.h"
-#include "faanidct.h"
 #include "imgconvert.h"
 #include "mathops.h"
 #include "mpegvideo.h"
@@ -48,60 +47,6 @@ uint32_t ff_square_tab[512] = { 0, };
 #define BIT_DEPTH 8
 #include "dsputilenc_template.c"
 
-av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
-                               const uint8_t *src_scantable)
-{
-    int i, end;
-
-    st->scantable = src_scantable;
-
-    for (i = 0; i < 64; i++) {
-        int j = src_scantable[i];
-        st->permutated[i] = permutation[j];
-    }
-
-    end = -1;
-    for (i = 0; i < 64; i++) {
-        int j = st->permutated[i];
-        if (j > end)
-            end = j;
-        st->raster_end[i] = end;
-    }
-}
-
-av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
-                                           int idct_permutation_type)
-{
-    int i;
-
-    if (ARCH_X86)
-        if (ff_init_scantable_permutation_x86(idct_permutation,
-                                              idct_permutation_type))
-            return;
-
-    switch (idct_permutation_type) {
-    case FF_NO_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = i;
-        break;
-    case FF_LIBMPEG2_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
-        break;
-    case FF_TRANSPOSE_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
-        break;
-    case FF_PARTTRANS_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
-        break;
-    default:
-        av_log(NULL, AV_LOG_ERROR,
-               "Internal error, IDCT permutation not set\n");
-    }
-}
-
 static int pix_sum_c(uint8_t *pix, int line_size)
 {
     int s = 0, i, j;
@@ -259,68 +204,6 @@ static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
     }
 }
 
-static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
-{
-    int i;
-
-    /* read the pixels */
-    for (i = 0; i < 8; i++) {
-        pixels[0] = av_clip_uint8(block[0]);
-        pixels[1] = av_clip_uint8(block[1]);
-        pixels[2] = av_clip_uint8(block[2]);
-        pixels[3] = av_clip_uint8(block[3]);
-        pixels[4] = av_clip_uint8(block[4]);
-        pixels[5] = av_clip_uint8(block[5]);
-        pixels[6] = av_clip_uint8(block[6]);
-        pixels[7] = av_clip_uint8(block[7]);
-
-        pixels += line_size;
-        block  += 8;
-    }
-}
-
-static void put_signed_pixels_clamped_c(const int16_t *block,
-                                        uint8_t *restrict pixels,
-                                        int line_size)
-{
-    int i, j;
-
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            if (*block < -128)
-                *pixels = 0;
-            else if (*block > 127)
-                *pixels = 255;
-            else
-                *pixels = (uint8_t) (*block + 128);
-            block++;
-            pixels++;
-        }
-        pixels += (line_size - 8);
-    }
-}
-
-static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
-                                 int line_size)
-{
-    int i;
-
-    /* read the pixels */
-    for (i = 0; i < 8; i++) {
-        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
-        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
-        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
-        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
-        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
-        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
-        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
-        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
-        pixels   += line_size;
-        block    += 8;
-    }
-}
-
 static int sum_abs_dctelem_c(int16_t *block)
 {
     int sum = 0, i;
@@ -967,7 +850,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
             s->dct_unquantize_inter(s, temp, 0, s->qscale);
     }
 
-    s->dsp.idct_add(lsrc2, 8, temp);
+    s->idsp.idct_add(lsrc2, 8, temp);
 
     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
 
@@ -1138,18 +1021,6 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
 
-static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_j_rev_dct(block);
-    put_pixels_clamped_c(block, dest, line_size);
-}
-
-static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
-{
-    ff_j_rev_dct(block);
-    add_pixels_clamped_c(block, dest, line_size);
-}
-
 /* draw the edges of width 'w' of an image of size width, height */
 // FIXME: Check that this is OK for MPEG-4 interlaced.
 static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
@@ -1209,36 +1080,8 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     }
 #endif /* CONFIG_ENCODERS */
 
-    if (avctx->bits_per_raw_sample == 10) {
-        c->idct_put              = ff_simple_idct_put_10;
-        c->idct_add              = ff_simple_idct_add_10;
-        c->idct                  = ff_simple_idct_10;
-        c->idct_permutation_type = FF_NO_IDCT_PERM;
-    } else {
-        if (avctx->idct_algo == FF_IDCT_INT) {
-            c->idct_put              = jref_idct_put;
-            c->idct_add              = jref_idct_add;
-            c->idct                  = ff_j_rev_dct;
-            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
-        } else if (avctx->idct_algo == FF_IDCT_FAAN) {
-            c->idct_put              = ff_faanidct_put;
-            c->idct_add              = ff_faanidct_add;
-            c->idct                  = ff_faanidct;
-            c->idct_permutation_type = FF_NO_IDCT_PERM;
-        } else { // accurate/default
-            c->idct_put              = ff_simple_idct_put_8;
-            c->idct_add              = ff_simple_idct_add_8;
-            c->idct                  = ff_simple_idct_8;
-            c->idct_permutation_type = FF_NO_IDCT_PERM;
-        }
-    }
-
     c->diff_pixels = diff_pixels_c;
 
-    c->put_pixels_clamped        = put_pixels_clamped_c;
-    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
-    c->add_pixels_clamped        = add_pixels_clamped_c;
-
     c->sum_abs_dctelem = sum_abs_dctelem_c;
 
     c->pix_sum   = pix_sum_c;
@@ -1309,7 +1152,4 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
     if (ARCH_X86)
         ff_dsputil_init_x86(c, avctx, high_bit_depth);
-
-    ff_init_scantable_permutation(c->idct_permutation,
-                                  c->idct_permutation_type);
 }
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index b271dccf82..dfbca5a9f7 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -44,22 +44,6 @@ typedef int (*me_cmp_func)(struct MpegEncContext *c,
                            uint8_t *blk1 /* align width (8 or 16) */,
                            uint8_t *blk2 /* align 1 */, int line_size, int h);
 
-/**
- * Scantable.
- */
-typedef struct ScanTable {
-    const uint8_t *scantable;
-    uint8_t permutated[64];
-    uint8_t raster_end[64];
-} ScanTable;
-
-void ff_init_scantable(uint8_t *permutation, ScanTable *st,
-                       const uint8_t *src_scantable);
-void ff_init_scantable_permutation(uint8_t *idct_permutation,
-                                   int idct_permutation_type);
-int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
-                                      int idct_permutation_type);
-
 /**
  * DSPContext.
  */
@@ -72,15 +56,6 @@ typedef struct DSPContext {
                         const uint8_t *s1 /* align 8 */,
                         const uint8_t *s2 /* align 8 */,
                         int stride);
-    void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
-                               uint8_t *pixels /* align 8 */,
-                               int line_size);
-    void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
-                                      uint8_t *pixels /* align 8 */,
-                                      int line_size);
-    void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
-                               uint8_t *pixels /* align 8 */,
-                               int line_size);
     int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
 
     int (*pix_sum)(uint8_t *pix, int line_size);
@@ -112,47 +87,6 @@ typedef struct DSPContext {
     void (*fdct)(int16_t *block /* align 16 */);
     void (*fdct248)(int16_t *block /* align 16 */);
 
-    /* IDCT really */
-    void (*idct)(int16_t *block /* align 16 */);
-
-    /**
-     * block -> idct -> clip to unsigned 8 bit -> dest.
-     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_put)(uint8_t *dest /* align 8 */,
-                     int line_size, int16_t *block /* align 16 */);
-
-    /**
-     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
-     * @param line_size size in bytes of a horizontal line of dest
-     */
-    void (*idct_add)(uint8_t *dest /* align 8 */,
-                     int line_size, int16_t *block /* align 16 */);
-
-    /**
-     * IDCT input permutation.
-     * Several optimized IDCTs need a permutated input (relative to the
-     * normal order of the reference IDCT).
-     * This permutation must be performed before the idct_put/add.
-     * Note, normally this can be merged with the zigzag/alternate scan<br>
-     * An example to avoid confusion:
-     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
-     * - (x -> reference DCT -> reference IDCT -> x)
-     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
-     *    -> simple_idct_mmx -> x)
-     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
-     *    -> simple_idct_mmx -> ...)
-     */
-    uint8_t idct_permutation[64];
-    int idct_permutation_type;
-#define FF_NO_IDCT_PERM 1
-#define FF_LIBMPEG2_IDCT_PERM 2
-#define FF_SIMPLE_IDCT_PERM 3
-#define FF_TRANSPOSE_IDCT_PERM 4
-#define FF_PARTTRANS_IDCT_PERM 5
-#define FF_SSE2_IDCT_PERM 6
-
     int (*try_8x8basis)(int16_t rem[64], int16_t weight[64],
                         int16_t basis[64], int scale);
     void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 9a559dbd45..a03914d23e 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -39,6 +39,7 @@
 #include "libavutil/imgutils.h"
 #include "libavutil/pixdesc.h"
 #include "avcodec.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "get_bits.h"
 #include "put_bits.h"
@@ -61,17 +62,17 @@ static const int dv_iweight_bits = 14;
 static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
 {
     DVVideoContext *s = avctx->priv_data;
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     int i;
 
-    ff_dsputil_init(&dsp, avctx);
+    ff_idctdsp_init(&idsp, avctx);
 
     for (i = 0; i < 64; i++)
-       s->dv_zigzag[0][i] = dsp.idct_permutation[ff_zigzag_direct[i]];
+       s->dv_zigzag[0][i] = idsp.idct_permutation[ff_zigzag_direct[i]];
 
     memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1]));
 
-    s->idct_put[0] = dsp.idct_put;
+    s->idct_put[0] = idsp.idct_put;
     s->idct_put[1] = ff_simple_idct248_put;
 
     return ff_dvvideo_init(avctx);
diff --git a/libavcodec/dxva2_mpeg2.c b/libavcodec/dxva2_mpeg2.c
index 044e669332..f6ef5e52ab 100644
--- a/libavcodec/dxva2_mpeg2.c
+++ b/libavcodec/dxva2_mpeg2.c
@@ -110,7 +110,7 @@ static void fill_quantization_matrices(AVCodecContext *avctx,
     for (i = 0; i < 4; i++)
         qm->bNewQmatrix[i] = 1;
     for (i = 0; i < 64; i++) {
-        int n = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+        int n = s->idsp.idct_permutation[ff_zigzag_direct[i]];
         qm->Qmatrix[0][i] = s->intra_matrix[n];;
         qm->Qmatrix[1][i] = s->inter_matrix[n];;
         qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];;
diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c
index 8fe1575a2f..9edf344857 100644
--- a/libavcodec/eamad.c
+++ b/libavcodec/eamad.c
@@ -33,6 +33,7 @@
 #include "get_bits.h"
 #include "aandcttab.h"
 #include "eaidct.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mpeg12.h"
 #include "mpeg12data.h"
@@ -47,7 +48,7 @@ typedef struct MadContext {
     AVCodecContext *avctx;
     BlockDSPContext bdsp;
     BswapDSPContext bbdsp;
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     AVFrame *last_frame;
     GetBitContext gb;
     void *bitstream_buf;
@@ -66,9 +67,9 @@ static av_cold int decode_init(AVCodecContext *avctx)
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
     ff_blockdsp_init(&s->bdsp, avctx);
     ff_bswapdsp_init(&s->bbdsp);
-    ff_dsputil_init(&s->dsp, avctx);
-    ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM);
-    ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable_permutation(s->idsp.idct_permutation, FF_NO_IDCT_PERM);
+    ff_init_scantable(s->idsp.idct_permutation, &s->scantable, ff_zigzag_direct);
     ff_mpeg12_init_vlcs();
 
     s->last_frame = av_frame_alloc();
diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c
index 1ead5f7adc..d8320c9f1d 100644
--- a/libavcodec/eatgq.c
+++ b/libavcodec/eatgq.c
@@ -32,7 +32,7 @@
 #define BITSTREAM_READER_LE
 #include "get_bits.h"
 #include "bytestream.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "aandcttab.h"
 #include "eaidct.h"
 #include "internal.h"
diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c
index 36ec2e4ff3..60d80e9621 100644
--- a/libavcodec/eatqi.c
+++ b/libavcodec/eatqi.c
@@ -32,6 +32,7 @@
 #include "get_bits.h"
 #include "aandcttab.h"
 #include "eaidct.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mpeg12.h"
 #include "mpegvideo.h"
@@ -51,9 +52,9 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     ff_blockdsp_init(&s->bdsp, avctx);
     ff_bswapdsp_init(&t->bsdsp);
-    ff_dsputil_init(&s->dsp, avctx);
-    ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM);
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable_permutation(s->idsp.idct_permutation, FF_NO_IDCT_PERM);
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
     s->qscale = 1;
     avctx->time_base = (AVRational){1, 15};
     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c
index 9660155619..456045dbc2 100644
--- a/libavcodec/g2meet.c
+++ b/libavcodec/g2meet.c
@@ -31,7 +31,7 @@
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "bytestream.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "get_bits.h"
 #include "internal.h"
 #include "mjpeg.h"
@@ -74,7 +74,7 @@ static const uint8_t chroma_quant[64] = {
 
 typedef struct JPGContext {
     BlockDSPContext bdsp;
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     ScanTable  scantable;
 
     VLC        dc_vlc[2], ac_vlc[2];
@@ -153,8 +153,8 @@ static av_cold int jpg_init(AVCodecContext *avctx, JPGContext *c)
         return ret;
 
     ff_blockdsp_init(&c->bdsp, avctx);
-    ff_dsputil_init(&c->dsp, avctx);
-    ff_init_scantable(c->dsp.idct_permutation, &c->scantable,
+    ff_idctdsp_init(&c->idsp, avctx);
+    ff_init_scantable(c->idsp.idct_permutation, &c->scantable,
                       ff_zigzag_direct);
 
     return 0;
@@ -279,13 +279,13 @@ static int jpg_decode_data(JPGContext *c, int width, int height,
                     if ((ret = jpg_decode_block(c, &gb, 0,
                                                 c->block[i + j * 2])) != 0)
                         return ret;
-                    c->dsp.idct(c->block[i + j * 2]);
+                    c->idsp.idct(c->block[i + j * 2]);
                 }
             }
             for (i = 1; i < 3; i++) {
                 if ((ret = jpg_decode_block(c, &gb, i, c->block[i + 3])) != 0)
                     return ret;
-                c->dsp.idct(c->block[i + 3]);
+                c->idsp.idct(c->block[i + 3]);
             }
 
             for (j = 0; j < 16; j++) {
diff --git a/libavcodec/h263.c b/libavcodec/h263.c
index 6d5ffc0b23..9019548a9e 100644
--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -267,7 +267,7 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n)
             if (a != 1024) {
                 ac_val -= 16;
                 for(i=1;i<8;i++) {
-                    block[s->dsp.idct_permutation[i<<3]] += ac_val[i];
+                    block[s->idsp.idct_permutation[i << 3]] += ac_val[i];
                 }
                 pred_dc = a;
             }
@@ -276,7 +276,7 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n)
             if (c != 1024) {
                 ac_val -= 16 * wrap;
                 for(i=1;i<8;i++) {
-                    block[s->dsp.idct_permutation[i   ]] += ac_val[i + 8];
+                    block[s->idsp.idct_permutation[i]] += ac_val[i + 8];
                 }
                 pred_dc = c;
             }
@@ -304,10 +304,10 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n)
 
     /* left copy */
     for(i=1;i<8;i++)
-        ac_val1[i    ] = block[s->dsp.idct_permutation[i<<3]];
+        ac_val1[i]     = block[s->idsp.idct_permutation[i << 3]];
     /* top copy */
     for(i=1;i<8;i++)
-        ac_val1[8 + i] = block[s->dsp.idct_permutation[i   ]];
+        ac_val1[8 + i] = block[s->idsp.idct_permutation[i]];
 }
 
 int16_t *ff_h263_pred_motion(MpegEncContext * s, int block, int dir,
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
new file mode 100644
index 0000000000..8542ab35aa
--- /dev/null
+++ b/libavcodec/idctdsp.c
@@ -0,0 +1,197 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/common.h"
+#include "avcodec.h"
+#include "dct.h"
+#include "faanidct.h"
+#include "idctdsp.h"
+#include "simple_idct.h"
+
+av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
+                               const uint8_t *src_scantable)
+{
+    int i, end;
+    /* Fill *st from src_scantable and permutation (64 scan positions). */
+    st->scantable = src_scantable;
+
+    for (i = 0; i < 64; i++) {
+        int j = src_scantable[i];
+        st->permutated[i] = permutation[j];
+    }
+    /* raster_end[i]: largest permutated[] value among positions 0..i. */
+    end = -1;
+    for (i = 0; i < 64; i++) {
+        int j = st->permutated[i];
+        if (j > end)
+            end = j;
+        st->raster_end[i] = end;
+    }
+}
+
+av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
+                                           int idct_permutation_type)
+{
+    int i;
+    /* Fill idct_permutation[0..63] for the given permutation type. */
+    if (ARCH_X86)
+        if (ff_init_scantable_permutation_x86(idct_permutation,
+                                              idct_permutation_type))
+            return; /* nonzero: presumably the x86 code filled the table */
+
+    switch (idct_permutation_type) {
+    case FF_NO_IDCT_PERM: /* identity */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = i;
+        break;
+    case FF_LIBMPEG2_IDCT_PERM: /* rotate the low 3 bits right by one */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
+        break;
+    case FF_TRANSPOSE_IDCT_PERM: /* swap bits 0-2 with bits 3-5 */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
+        break;
+    case FF_PARTTRANS_IDCT_PERM: /* swap bits 0-1 with bits 3-4 */
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
+        break;
+    default: /* only logs; idct_permutation is left untouched */
+        av_log(NULL, AV_LOG_ERROR,
+               "Internal error, IDCT permutation not set\n");
+    }
+}
+
+static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* store the 8x8 block, each value clipped via av_clip_uint8() */
+    for (i = 0; i < 8; i++) {
+        pixels[0] = av_clip_uint8(block[0]);
+        pixels[1] = av_clip_uint8(block[1]);
+        pixels[2] = av_clip_uint8(block[2]);
+        pixels[3] = av_clip_uint8(block[3]);
+        pixels[4] = av_clip_uint8(block[4]);
+        pixels[5] = av_clip_uint8(block[5]);
+        pixels[6] = av_clip_uint8(block[6]);
+        pixels[7] = av_clip_uint8(block[7]);
+
+        pixels += line_size; /* next destination row */
+        block  += 8;         /* next 8 coefficients */
+    }
+}
+
+static void put_signed_pixels_clamped_c(const int16_t *block,
+                                        uint8_t *restrict pixels,
+                                        int line_size)
+{
+    int i, j;
+    /* Store block + 128, saturated to 0..255, as 8 rows of 8 pixels. */
+    for (i = 0; i < 8; i++) {
+        for (j = 0; j < 8; j++) {
+            if (*block < -128)
+                *pixels = 0;
+            else if (*block > 127)
+                *pixels = 255;
+            else
+                *pixels = (uint8_t) (*block + 128);
+            block++;
+            pixels++;
+        }
+        pixels += (line_size - 8); /* inner loop already advanced by 8 */
+    }
+}
+
+static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
+                                 int line_size)
+{
+    int i;
+
+    /* add the 8x8 block to the existing pixels, clipped via av_clip_uint8() */
+    for (i = 0; i < 8; i++) {
+        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
+        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
+        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
+        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
+        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
+        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
+        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
+        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
+        pixels   += line_size; /* next destination row */
+        block    += 8;         /* next 8 coefficients */
+    }
+}
+
+static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct(block); /* IDCT; its output is read back from block below */
+    put_pixels_clamped_c(block, dest, line_size); /* overwrite dest */
+}
+
+static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
+{
+    ff_j_rev_dct(block); /* IDCT; its output is read back from block below */
+    add_pixels_clamped_c(block, dest, line_size); /* accumulate onto dest */
+}
+
+av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
+{
+    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
+    /* C defaults: only exactly 10 bits selects the _10 IDCT variants. */
+    if (avctx->bits_per_raw_sample == 10) {
+        c->idct_put              = ff_simple_idct_put_10;
+        c->idct_add              = ff_simple_idct_add_10;
+        c->idct                  = ff_simple_idct_10;
+        c->idct_permutation_type = FF_NO_IDCT_PERM;
+    } else {
+        if (avctx->idct_algo == FF_IDCT_INT) {
+            c->idct_put              = jref_idct_put;
+            c->idct_add              = jref_idct_add;
+            c->idct                  = ff_j_rev_dct;
+            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+        } else if (avctx->idct_algo == FF_IDCT_FAAN) {
+            c->idct_put              = ff_faanidct_put;
+            c->idct_add              = ff_faanidct_add;
+            c->idct                  = ff_faanidct;
+            c->idct_permutation_type = FF_NO_IDCT_PERM;
+        } else { // accurate/default
+            c->idct_put              = ff_simple_idct_put_8;
+            c->idct_add              = ff_simple_idct_add_8;
+            c->idct                  = ff_simple_idct_8;
+            c->idct_permutation_type = FF_NO_IDCT_PERM;
+        }
+    }
+
+    c->put_pixels_clamped        = put_pixels_clamped_c;
+    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
+    c->add_pixels_clamped        = add_pixels_clamped_c;
+    /* Arch init runs after the C defaults, presumably to override them. */
+    if (ARCH_ARM)
+        ff_idctdsp_init_arm(c, avctx, high_bit_depth);
+    if (ARCH_PPC)
+        ff_idctdsp_init_ppc(c, avctx, high_bit_depth);
+    if (ARCH_X86)
+        ff_idctdsp_init_x86(c, avctx, high_bit_depth);
+    /* Derive the table from whichever permutation type is now set. */
+    ff_init_scantable_permutation(c->idct_permutation,
+                                  c->idct_permutation_type);
+}
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
new file mode 100644
index 0000000000..e3a2317679
--- /dev/null
+++ b/libavcodec/idctdsp.h
@@ -0,0 +1,104 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_IDCTDSP_H
+#define AVCODEC_IDCTDSP_H
+
+#include <stdint.h>
+
+#include "avcodec.h"
+
+/**
+ * Scan order plus the tables ff_init_scantable() derives from it.
+ */
+typedef struct ScanTable {
+    const uint8_t *scantable; ///< source scan order, stored as passed in
+    uint8_t permutated[64];   ///< scantable[i] mapped through the permutation
+    uint8_t raster_end[64];   ///< maximum of permutated[0..i]
+} ScanTable;
+
+void ff_init_scantable(uint8_t *permutation, ScanTable *st,
+                       const uint8_t *src_scantable);
+void ff_init_scantable_permutation(uint8_t *idct_permutation,
+                                   int idct_permutation_type);
+int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+                                      int idct_permutation_type);
+
+typedef struct IDCTDSPContext {
+    /* pixel ops : interface with DCT */
+    void (*put_pixels_clamped)(const int16_t *block /* align 16 */,
+                               uint8_t *pixels /* align 8 */,
+                               int line_size);
+    void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */,
+                                      uint8_t *pixels /* align 8 */,
+                                      int line_size);
+    void (*add_pixels_clamped)(const int16_t *block /* align 16 */,
+                               uint8_t *pixels /* align 8 */,
+                               int line_size);
+    /** IDCT on block only, no pixel output (compare idct_put/idct_add). */
+    void (*idct)(int16_t *block /* align 16 */);
+
+    /**
+     * block -> idct -> clip to unsigned 8 bit -> dest.
+     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_put)(uint8_t *dest /* align 8 */,
+                     int line_size, int16_t *block /* align 16 */);
+
+    /**
+     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
+     * @param line_size size in bytes of a horizontal line of dest
+     */
+    void (*idct_add)(uint8_t *dest /* align 8 */,
+                     int line_size, int16_t *block /* align 16 */);
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permutated input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note: normally this can be merged with the zigzag/alternate scan.
+     * An example to avoid confusion:
+     * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+    int idct_permutation_type;
+#define FF_NO_IDCT_PERM 1 /* identity */
+#define FF_LIBMPEG2_IDCT_PERM 2
+#define FF_SIMPLE_IDCT_PERM 3 /* no case in the generic C switch */
+#define FF_TRANSPOSE_IDCT_PERM 4
+#define FF_PARTTRANS_IDCT_PERM 5
+#define FF_SSE2_IDCT_PERM 6 /* no case in the generic C switch */
+} IDCTDSPContext;
+
+void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx);
+
+void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+                         unsigned high_bit_depth);
+
+#endif /* AVCODEC_IDCTDSP_H */
diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c
index 2bda7233f7..d37eb793cd 100644
--- a/libavcodec/intrax8.c
+++ b/libavcodec/intrax8.c
@@ -24,6 +24,7 @@
 #include "avcodec.h"
 #include "error_resilience.h"
 #include "get_bits.h"
+#include "idctdsp.h"
 #include "mpegvideo.h"
 #include "msmpeg4data.h"
 #include "intrax8huf.h"
@@ -440,7 +441,7 @@ lut2[q>12][c]={
 static void x8_ac_compensation(IntraX8Context * const w, int const direction, int const dc_level){
     MpegEncContext * const s= w->s;
     int t;
-#define B(x,y)  s->block[0][s->dsp.idct_permutation[(x)+(y)*8]]
+#define B(x, y) s->block[0][s->idsp.idct_permutation[(x) + (y) * 8]]
 #define T(x)  ((x) * dc_level + 0x8000) >> 16;
     switch(direction){
     case 0:
@@ -646,9 +647,9 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){
                                             s->current_picture.f->linesize[!!chroma] );
     }
     if(!zeros_only)
-        s->dsp.idct_add ( s->dest[chroma],
-                          s->current_picture.f->linesize[!!chroma],
-                          s->block[0] );
+        s->idsp.idct_add(s->dest[chroma],
+                         s->current_picture.f->linesize[!!chroma],
+                         s->block[0]);
 
 block_placed:
 
@@ -698,9 +699,9 @@ av_cold void ff_intrax8_common_init(IntraX8Context * w, MpegEncContext * const s
     assert(s->mb_width>0);
     w->prediction_table=av_mallocz(s->mb_width*2*2);//two rows, 2 blocks per cannon mb
 
-    ff_init_scantable(s->dsp.idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]);
-    ff_init_scantable(s->dsp.idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]);
-    ff_init_scantable(s->dsp.idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]);
+    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]);
+    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]);
+    ff_init_scantable(s->idsp.idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]);
 
     ff_intrax8dsp_init(&w->dsp);
 }
diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c
index 9f06818667..fbb024b9a6 100644
--- a/libavcodec/ljpegenc.c
+++ b/libavcodec/ljpegenc.c
@@ -35,7 +35,7 @@
 #include "libavutil/pixdesc.h"
 
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mjpegenc_common.h"
 #include "mpegvideo.h"
@@ -43,7 +43,7 @@
 #include "mjpegenc.h"
 
 typedef struct LJpegEncContext {
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     ScanTable scantable;
     uint16_t matrix[64];
 
@@ -285,8 +285,9 @@ static av_cold int ljpeg_encode_init(AVCodecContext *avctx)
 
     s->scratch = av_malloc_array(avctx->width + 1, sizeof(*s->scratch));
 
-    ff_dsputil_init(&s->dsp, avctx);
-    ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
+                      ff_zigzag_direct);
 
     av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift,
                                      &chroma_v_shift);
diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c
index b4213972b7..6b70e37e76 100644
--- a/libavcodec/mdec.c
+++ b/libavcodec/mdec.c
@@ -29,6 +29,7 @@
 
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "idctdsp.h"
 #include "mpegvideo.h"
 #include "mpeg12.h"
 #include "thread.h"
@@ -36,7 +37,7 @@
 typedef struct MDECContext {
     AVCodecContext *avctx;
     BlockDSPContext bdsp;
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     ThreadFrame frame;
     GetBitContext gb;
     ScanTable scantable;
@@ -146,14 +147,14 @@ static inline void idct_put(MDECContext *a, AVFrame *frame, int mb_x, int mb_y)
     uint8_t *dest_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8;
     uint8_t *dest_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8;
 
-    a->dsp.idct_put(dest_y,                    linesize, block[0]);
-    a->dsp.idct_put(dest_y                + 8, linesize, block[1]);
-    a->dsp.idct_put(dest_y + 8 * linesize,     linesize, block[2]);
-    a->dsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]);
+    a->idsp.idct_put(dest_y,                    linesize, block[0]);
+    a->idsp.idct_put(dest_y + 8,                linesize, block[1]);
+    a->idsp.idct_put(dest_y + 8 * linesize,     linesize, block[2]);
+    a->idsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]);
 
     if (!(a->avctx->flags & CODEC_FLAG_GRAY)) {
-        a->dsp.idct_put(dest_cb, frame->linesize[1], block[4]);
-        a->dsp.idct_put(dest_cr, frame->linesize[2], block[5]);
+        a->idsp.idct_put(dest_cb, frame->linesize[1], block[4]);
+        a->idsp.idct_put(dest_cr, frame->linesize[2], block[5]);
     }
 }
 
@@ -215,9 +216,10 @@ static av_cold int decode_init(AVCodecContext *avctx)
     a->avctx           = avctx;
 
     ff_blockdsp_init(&a->bdsp, avctx);
-    ff_dsputil_init(&a->dsp, avctx);
+    ff_idctdsp_init(&a->idsp, avctx);
     ff_mpeg12_init_vlcs();
-    ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_zigzag_direct);
+    ff_init_scantable(a->idsp.idct_permutation, &a->scantable,
+                      ff_zigzag_direct);
 
     if (avctx->idct_algo == FF_IDCT_AUTO)
         avctx->idct_algo = FF_IDCT_SIMPLE;
diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c
index 4d21b5165e..88ee5d380c 100644
--- a/libavcodec/mimic.c
+++ b/libavcodec/mimic.c
@@ -29,8 +29,8 @@
 #include "get_bits.h"
 #include "bytestream.h"
 #include "bswapdsp.h"
-#include "dsputil.h"
 #include "hpeldsp.h"
+#include "idctdsp.h"
 #include "thread.h"
 
 #define MIMIC_HEADER_SIZE   20
@@ -56,8 +56,8 @@ typedef struct {
     ScanTable       scantable;
     BlockDSPContext bdsp;
     BswapDSPContext bbdsp;
-    DSPContext      dsp;
     HpelDSPContext  hdsp;
+    IDCTDSPContext  idsp;
     VLC             vlc;
 
     /* Kept in the context so multithreading can have a constant to read from */
@@ -151,9 +151,9 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx)
     }
     ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_bswapdsp_init(&ctx->bbdsp);
-    ff_dsputil_init(&ctx->dsp, avctx);
     ff_hpeldsp_init(&ctx->hdsp, avctx->flags);
-    ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, col_zag);
+    ff_idctdsp_init(&ctx->idsp, avctx);
+    ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, col_zag);
 
     for (i = 0; i < FF_ARRAY_ELEMS(ctx->frames); i++) {
         ctx->frames[i].f = av_frame_alloc();
@@ -302,7 +302,7 @@ static int decode(MimicContext *ctx, int quality, int num_coeffs,
                                    "block.\n");
                             return ret;
                         }
-                        ctx->dsp.idct_put(dst, stride, ctx->dct_block);
+                        ctx->idsp.idct_put(dst, stride, ctx->dct_block);
                     } else {
                         unsigned int backref = get_bits(&ctx->gb, 4);
                         int index            = (ctx->cur_index + backref) & 15;
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index f674539507..d9a73d8426 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -36,6 +36,7 @@
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "blockdsp.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mjpeg.h"
 #include "mjpegdec.h"
@@ -95,8 +96,9 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx)
     s->avctx = avctx;
     ff_blockdsp_init(&s->bdsp, avctx);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
-    ff_dsputil_init(&s->dsp, avctx);
-    ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct);
+    ff_idctdsp_init(&s->idsp, avctx);
+    ff_init_scantable(s->idsp.idct_permutation, &s->scantable,
+                      ff_zigzag_direct);
     s->buffer_size   = 0;
     s->buffer        = NULL;
     s->start_code    = -1;
@@ -889,7 +891,7 @@ static int mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah,
                                        "error y=%d x=%d\n", mb_y, mb_x);
                                 return AVERROR_INVALIDDATA;
                             }
-                            s->dsp.idct_put(ptr, linesize[c], s->block);
+                            s->idsp.idct_put(ptr, linesize[c], s->block);
                         }
                     } else {
                         int block_idx  = s->block_stride[c] * (v * mb_y + y) +
@@ -1002,7 +1004,7 @@ static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss,
                                                  reference_data + block_offset,
                                                  linesize, 8);
                 } else {
-                    s->dsp.idct_put(ptr, linesize, *block);
+                    s->idsp.idct_put(ptr, linesize, *block);
                     ptr += 8;
                 }
             }
diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h
index 0d1dd9ee03..aa4703a24d 100644
--- a/libavcodec/mjpegdec.h
+++ b/libavcodec/mjpegdec.h
@@ -35,8 +35,8 @@
 #include "avcodec.h"
 #include "blockdsp.h"
 #include "get_bits.h"
-#include "dsputil.h"
 #include "hpeldsp.h"
+#include "idctdsp.h"
 
 #define MAX_COMPONENTS 4
 
@@ -97,8 +97,8 @@ typedef struct MJpegDecodeContext {
     uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode)
     ScanTable scantable;
     BlockDSPContext bdsp;
-    DSPContext dsp;
     HpelDSPContext hdsp;
+    IDCTDSPContext idsp;
 
     int restart_interval;
     int restart_count;
diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c
index 3dba414f1d..adb335e5e1 100644
--- a/libavcodec/mjpegenc_common.c
+++ b/libavcodec/mjpegenc_common.c
@@ -26,7 +26,7 @@
 #include "libavutil/pixfmt.h"
 
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "put_bits.h"
 #include "mjpegenc_common.h"
 #include "mjpeg.h"
diff --git a/libavcodec/mjpegenc_common.h b/libavcodec/mjpegenc_common.h
index 57dc9ddb5b..b48911e364 100644
--- a/libavcodec/mjpegenc_common.h
+++ b/libavcodec/mjpegenc_common.h
@@ -24,7 +24,7 @@
 #include <stdint.h>
 
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "put_bits.h"
 
 void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb,
diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
index 0bf3c20c37..aa98454d05 100644
--- a/libavcodec/mpeg12dec.c
+++ b/libavcodec/mpeg12dec.c
@@ -33,8 +33,8 @@
 
 #include "avcodec.h"
 #include "bytestream.h"
-#include "dsputil.h"
 #include "error_resilience.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mpeg_er.h"
 #include "mpeg12.h"
@@ -1100,7 +1100,7 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx)
     /* we need some permutation to store matrices,
      * until MPV_common_init() sets the real permutation. */
     for (i = 0; i < 64; i++)
-        s2->dsp.idct_permutation[i] = i;
+        s2->idsp.idct_permutation[i] = i;
 
     ff_MPV_decode_defaults(s2);
 
@@ -1309,15 +1309,15 @@ static int mpeg_decode_postinit(AVCodecContext *avctx)
 
         /* Quantization matrices may need reordering
          * if DCT permutation is changed. */
-        memcpy(old_permutation, s->dsp.idct_permutation, 64 * sizeof(uint8_t));
+        memcpy(old_permutation, s->idsp.idct_permutation, 64 * sizeof(uint8_t));
 
         if (ff_MPV_common_init(s) < 0)
             return -2;
 
-        quant_matrix_rebuild(s->intra_matrix,        old_permutation, s->dsp.idct_permutation);
-        quant_matrix_rebuild(s->inter_matrix,        old_permutation, s->dsp.idct_permutation);
-        quant_matrix_rebuild(s->chroma_intra_matrix, old_permutation, s->dsp.idct_permutation);
-        quant_matrix_rebuild(s->chroma_inter_matrix, old_permutation, s->dsp.idct_permutation);
+        quant_matrix_rebuild(s->intra_matrix,        old_permutation, s->idsp.idct_permutation);
+        quant_matrix_rebuild(s->inter_matrix,        old_permutation, s->idsp.idct_permutation);
+        quant_matrix_rebuild(s->chroma_intra_matrix, old_permutation, s->idsp.idct_permutation);
+        quant_matrix_rebuild(s->chroma_inter_matrix, old_permutation, s->idsp.idct_permutation);
 
         s1->mpeg_enc_ctx_allocated = 1;
     }
@@ -1469,7 +1469,7 @@ static int load_matrix(MpegEncContext *s, uint16_t matrix0[64],
     int i;
 
     for (i = 0; i < 64; i++) {
-        int j = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+        int j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
         int v = get_bits(&s->gb, 8);
         if (v == 0) {
             av_log(s->avctx, AV_LOG_ERROR, "matrix damaged\n");
@@ -1561,11 +1561,11 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1)
     }
 
     if (s->alternate_scan) {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
     } else {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
     }
 
     /* composite display not parsed */
@@ -2070,7 +2070,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         load_matrix(s, s->chroma_intra_matrix, s->intra_matrix, 1);
     } else {
         for (i = 0; i < 64; i++) {
-            j = s->dsp.idct_permutation[i];
+            j = s->idsp.idct_permutation[i];
             v = ff_mpeg1_default_intra_matrix[i];
             s->intra_matrix[j]        = v;
             s->chroma_intra_matrix[j] = v;
@@ -2080,7 +2080,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx,
         load_matrix(s, s->chroma_inter_matrix, s->inter_matrix, 0);
     } else {
         for (i = 0; i < 64; i++) {
-            int j = s->dsp.idct_permutation[i];
+            int j = s->idsp.idct_permutation[i];
             v = ff_mpeg1_default_non_intra_matrix[i];
             s->inter_matrix[j]        = v;
             s->chroma_inter_matrix[j] = v;
@@ -2142,7 +2142,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx)
     s1->mpeg_enc_ctx_allocated = 1;
 
     for (i = 0; i < 64; i++) {
-        int j = s->dsp.idct_permutation[i];
+        int j = s->idsp.idct_permutation[i];
         v = ff_mpeg1_default_intra_matrix[i];
         s->intra_matrix[j]        = v;
         s->chroma_intra_matrix[j] = v;
diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
index 0e3e5803b1..a4e7725753 100644
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -21,6 +21,7 @@
  */
 
 #include "error_resilience.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
@@ -71,11 +72,11 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
                 n == 1 || n == 3) {
                 /* same qscale */
                 for (i = 1; i < 8; i++)
-                    block[s->dsp.idct_permutation[i << 3]] += ac_val[i];
+                    block[s->idsp.idct_permutation[i << 3]] += ac_val[i];
             } else {
                 /* different qscale, we must rescale */
                 for (i = 1; i < 8; i++)
-                    block[s->dsp.idct_permutation[i << 3]] += ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale);
+                    block[s->idsp.idct_permutation[i << 3]] += ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale);
             }
         } else {
             const int xy = s->mb_x + s->mb_y * s->mb_stride - s->mb_stride;
@@ -86,21 +87,21 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir)
                 n == 2 || n == 3) {
                 /* same qscale */
                 for (i = 1; i < 8; i++)
-                    block[s->dsp.idct_permutation[i]] += ac_val[i + 8];
+                    block[s->idsp.idct_permutation[i]] += ac_val[i + 8];
             } else {
                 /* different qscale, we must rescale */
                 for (i = 1; i < 8; i++)
-                    block[s->dsp.idct_permutation[i]] += ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale);
+                    block[s->idsp.idct_permutation[i]] += ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale);
             }
         }
     }
     /* left copy */
     for (i = 1; i < 8; i++)
-        ac_val1[i] = block[s->dsp.idct_permutation[i << 3]];
+        ac_val1[i] = block[s->idsp.idct_permutation[i << 3]];
 
     /* top copy */
     for (i = 1; i < 8; i++)
-        ac_val1[8 + i] = block[s->dsp.idct_permutation[i]];
+        ac_val1[8 + i] = block[s->idsp.idct_permutation[i]];
 }
 
 /**
@@ -1815,7 +1816,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
 
             /* load default matrixes */
             for (i = 0; i < 64; i++) {
-                int j = s->dsp.idct_permutation[i];
+                int j = s->idsp.idct_permutation[i];
                 v = ff_mpeg4_default_intra_matrix[i];
                 s->intra_matrix[j]        = v;
                 s->chroma_intra_matrix[j] = v;
@@ -1835,14 +1836,14 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                         break;
 
                     last = v;
-                    j = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+                    j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
                     s->intra_matrix[j]        = last;
                     s->chroma_intra_matrix[j] = last;
                 }
 
                 /* replicate last value */
                 for (; i < 64; i++) {
-                    int j = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+                    int j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
                     s->intra_matrix[j]        = last;
                     s->chroma_intra_matrix[j] = last;
                 }
@@ -1858,14 +1859,14 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb)
                         break;
 
                     last = v;
-                    j = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+                    j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
                     s->inter_matrix[j]        = v;
                     s->chroma_inter_matrix[j] = v;
                 }
 
                 /* replicate last value */
                 for (; i < 64; i++) {
-                    int j = s->dsp.idct_permutation[ff_zigzag_direct[i]];
+                    int j = s->idsp.idct_permutation[ff_zigzag_direct[i]];
                     s->inter_matrix[j]        = last;
                     s->chroma_inter_matrix[j] = last;
                 }
@@ -2219,15 +2220,15 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb)
     }
 
     if (s->alternate_scan) {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable,   ff_alternate_vertical_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable,   ff_alternate_vertical_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_vertical_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,   ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,   ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
     } else {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable,   ff_zigzag_direct);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable,   ff_zigzag_direct);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,   ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,   ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
     }
 
     if (s->pict_type == AV_PICTURE_TYPE_S &&
diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c
index b95752fe49..f120932443 100644
--- a/libavcodec/mpeg4videoenc.c
+++ b/libavcodec/mpeg4videoenc.c
@@ -110,11 +110,11 @@ static inline void restore_ac_coeffs(MpegEncContext *s, int16_t block[6][64],
         if (dir[n]) {
             /* top prediction */
             for (i = 1; i < 8; i++)
-                block[n][s->dsp.idct_permutation[i]] = ac_val[i + 8];
+                block[n][s->idsp.idct_permutation[i]] = ac_val[i + 8];
         } else {
             /* left prediction */
             for (i = 1; i < 8; i++)
-                block[n][s->dsp.idct_permutation[i << 3]] = ac_val[i];
+                block[n][s->idsp.idct_permutation[i << 3]] = ac_val[i];
         }
     }
 }
@@ -152,17 +152,17 @@ static inline int decide_ac_pred(MpegEncContext *s, int16_t block[6][64],
             if (s->mb_y == 0 || s->qscale == qscale_table[xy] || n == 2 || n == 3) {
                 /* same qscale */
                 for (i = 1; i < 8; i++) {
-                    const int level = block[n][s->dsp.idct_permutation[i]];
-                    block[n][s->dsp.idct_permutation[i]] = level - ac_val[i + 8];
-                    ac_val1[i]     = block[n][s->dsp.idct_permutation[i << 3]];
+                    const int level = block[n][s->idsp.idct_permutation[i]];
+                    block[n][s->idsp.idct_permutation[i]] = level - ac_val[i + 8];
+                    ac_val1[i]     = block[n][s->idsp.idct_permutation[i << 3]];
                     ac_val1[i + 8] = level;
                 }
             } else {
                 /* different qscale, we must rescale */
                 for (i = 1; i < 8; i++) {
-                    const int level = block[n][s->dsp.idct_permutation[i]];
-                    block[n][s->dsp.idct_permutation[i]] = level - ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale);
-                    ac_val1[i]     = block[n][s->dsp.idct_permutation[i << 3]];
+                    const int level = block[n][s->idsp.idct_permutation[i]];
+                    block[n][s->idsp.idct_permutation[i]] = level - ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale);
+                    ac_val1[i]     = block[n][s->idsp.idct_permutation[i << 3]];
                     ac_val1[i + 8] = level;
                 }
             }
@@ -174,18 +174,18 @@ static inline int decide_ac_pred(MpegEncContext *s, int16_t block[6][64],
             if (s->mb_x == 0 || s->qscale == qscale_table[xy] || n == 1 || n == 3) {
                 /* same qscale */
                 for (i = 1; i < 8; i++) {
-                    const int level = block[n][s->dsp.idct_permutation[i << 3]];
-                    block[n][s->dsp.idct_permutation[i << 3]] = level - ac_val[i];
+                    const int level = block[n][s->idsp.idct_permutation[i << 3]];
+                    block[n][s->idsp.idct_permutation[i << 3]] = level - ac_val[i];
                     ac_val1[i]     = level;
-                    ac_val1[i + 8] = block[n][s->dsp.idct_permutation[i]];
+                    ac_val1[i + 8] = block[n][s->idsp.idct_permutation[i]];
                 }
             } else {
                 /* different qscale, we must rescale */
                 for (i = 1; i < 8; i++) {
-                    const int level = block[n][s->dsp.idct_permutation[i << 3]];
-                    block[n][s->dsp.idct_permutation[i << 3]] = level - ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale);
+                    const int level = block[n][s->idsp.idct_permutation[i << 3]];
+                    block[n][s->idsp.idct_permutation[i << 3]] = level - ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale);
                     ac_val1[i]     = level;
-                    ac_val1[i + 8] = block[n][s->dsp.idct_permutation[i]];
+                    ac_val1[i + 8] = block[n][s->idsp.idct_permutation[i]];
                 }
             }
             st[n] = s->intra_v_scantable.permutated;
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index fb63d6afda..a4a37d4931 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -34,7 +34,7 @@
 #include "libavutil/timer.h"
 #include "avcodec.h"
 #include "blockdsp.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "mathops.h"
 #include "mpegutils.h"
@@ -380,6 +380,7 @@ av_cold int ff_dct_common_init(MpegEncContext *s)
     ff_blockdsp_init(&s->bdsp, s->avctx);
     ff_dsputil_init(&s->dsp, s->avctx);
     ff_hpeldsp_init(&s->hdsp, s->avctx->flags);
+    ff_idctdsp_init(&s->idsp, s->avctx);
     ff_mpegvideodsp_init(&s->mdsp);
     ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample);
 
@@ -403,14 +404,14 @@ av_cold int ff_dct_common_init(MpegEncContext *s)
      * note: only wmv uses different ones
      */
     if (s->alternate_scan) {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable  , ff_alternate_vertical_scan);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable  , ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan);
     } else {
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable  , ff_zigzag_direct);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable  , ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
     }
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan);
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan);
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan);
 
     return 0;
 }
@@ -2041,7 +2042,7 @@ static inline void put_dct(MpegEncContext *s,
                            int16_t *block, int i, uint8_t *dest, int line_size, int qscale)
 {
     s->dct_unquantize_intra(s, block, i, qscale);
-    s->dsp.idct_put (dest, line_size, block);
+    s->idsp.idct_put(dest, line_size, block);
 }
 
 /* add block[] to dest[] */
@@ -2049,7 +2050,7 @@ static inline void add_dct(MpegEncContext *s,
                            int16_t *block, int i, uint8_t *dest, int line_size)
 {
     if (s->block_last_index[i] >= 0) {
-        s->dsp.idct_add (dest, line_size, block);
+        s->idsp.idct_add(dest, line_size, block);
     }
 }
 
@@ -2059,7 +2060,7 @@ static inline void add_dequant_dct(MpegEncContext *s,
     if (s->block_last_index[i] >= 0) {
         s->dct_unquantize_inter(s, block, i, qscale);
 
-        s->dsp.idct_add (dest, line_size, block);
+        s->idsp.idct_add(dest, line_size, block);
     }
 }
 
@@ -2127,7 +2128,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
        av_log(s->avctx, AV_LOG_DEBUG, "DCT coeffs of MB at %dx%d:\n", s->mb_x, s->mb_y);
        for(i=0; i<6; i++){
            for(j=0; j<64; j++){
-               av_log(s->avctx, AV_LOG_DEBUG, "%5d", block[i][s->dsp.idct_permutation[j]]);
+               av_log(s->avctx, AV_LOG_DEBUG, "%5d",
+                      block[i][s->idsp.idct_permutation[j]]);
            }
            av_log(s->avctx, AV_LOG_DEBUG, "\n");
        }
@@ -2304,29 +2306,29 @@ FF_ENABLE_DEPRECATION_WARNINGS
                     }
                 }
             }else{
-                s->dsp.idct_put(dest_y                          , dct_linesize, block[0]);
-                s->dsp.idct_put(dest_y              + block_size, dct_linesize, block[1]);
-                s->dsp.idct_put(dest_y + dct_offset             , dct_linesize, block[2]);
-                s->dsp.idct_put(dest_y + dct_offset + block_size, dct_linesize, block[3]);
+                s->idsp.idct_put(dest_y,                           dct_linesize, block[0]);
+                s->idsp.idct_put(dest_y              + block_size, dct_linesize, block[1]);
+                s->idsp.idct_put(dest_y + dct_offset,              dct_linesize, block[2]);
+                s->idsp.idct_put(dest_y + dct_offset + block_size, dct_linesize, block[3]);
 
                 if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                     if(s->chroma_y_shift){
-                        s->dsp.idct_put(dest_cb, uvlinesize, block[4]);
-                        s->dsp.idct_put(dest_cr, uvlinesize, block[5]);
+                        s->idsp.idct_put(dest_cb, uvlinesize, block[4]);
+                        s->idsp.idct_put(dest_cr, uvlinesize, block[5]);
                     }else{
 
                         dct_linesize = uvlinesize << s->interlaced_dct;
                         dct_offset   = s->interlaced_dct ? uvlinesize : uvlinesize * 8;
 
-                        s->dsp.idct_put(dest_cb,              dct_linesize, block[4]);
-                        s->dsp.idct_put(dest_cr,              dct_linesize, block[5]);
-                        s->dsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
-                        s->dsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
+                        s->idsp.idct_put(dest_cb,              dct_linesize, block[4]);
+                        s->idsp.idct_put(dest_cr,              dct_linesize, block[5]);
+                        s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]);
+                        s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]);
                         if(!s->chroma_x_shift){//Chroma444
-                            s->dsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
-                            s->dsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
-                            s->dsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
-                            s->dsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
+                            s->idsp.idct_put(dest_cb + 8,              dct_linesize, block[8]);
+                            s->idsp.idct_put(dest_cr + 8,              dct_linesize, block[9]);
+                            s->idsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]);
+                            s->idsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]);
                         }
                     }
                 }//gray
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 191dac0d3a..27c72dafe3 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -35,6 +35,7 @@
 #include "get_bits.h"
 #include "h263dsp.h"
 #include "hpeldsp.h"
+#include "idctdsp.h"
 #include "mpegvideodsp.h"
 #include "put_bits.h"
 #include "ratecontrol.h"
@@ -352,6 +353,7 @@ typedef struct MpegEncContext {
     BlockDSPContext bdsp;
     DSPContext dsp;             ///< pointers for accelerated dsp functions
     HpelDSPContext hdsp;
+    IDCTDSPContext idsp;
     MpegVideoDSPContext mdsp;
     QpelDSPContext qdsp;
     VideoDSPContext vdsp;
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 3baf37a5be..65e2a8c8c7 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -38,6 +38,7 @@
 #include "avcodec.h"
 #include "dct.h"
 #include "dsputil.h"
+#include "idctdsp.h"
 #include "mpeg12.h"
 #include "mpegvideo.h"
 #include "h261.h"
@@ -86,7 +87,7 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             dsp->fdct == ff_jpeg_fdct_islow_10 ||
             dsp->fdct == ff_faandct) {
             for (i = 0; i < 64; i++) {
-                const int j = dsp->idct_permutation[i];
+                const int j = s->idsp.idct_permutation[i];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
@@ -98,7 +99,7 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             }
         } else if (dsp->fdct == ff_fdct_ifast) {
             for (i = 0; i < 64; i++) {
-                const int j = dsp->idct_permutation[i];
+                const int j = s->idsp.idct_permutation[i];
                 /* 16 <= qscale * quant_matrix[i] <= 7905
                  * Assume x = ff_aanscales[i] * qscale * quant_matrix[i]
                  *             19952 <=              x  <= 249205026
@@ -111,7 +112,7 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64],
             }
         } else {
             for (i = 0; i < 64; i++) {
-                const int j = dsp->idct_permutation[i];
+                const int j = s->idsp.idct_permutation[i];
                 /* We can safely suppose that 16 <= quant_matrix[i] <= 255
                  * Assume x = qscale * quant_matrix[i]
                  * So             16 <=              x  <= 7905
@@ -755,7 +756,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx)
 
     /* init q matrix */
     for (i = 0; i < 64; i++) {
-        int j = s->dsp.idct_permutation[i];
+        int j = s->idsp.idct_permutation[i];
         if (CONFIG_MPEG4_ENCODER && s->codec_id == AV_CODEC_ID_MPEG4 &&
             s->mpeg_quant) {
             s->intra_matrix[j] = ff_mpeg4_default_intra_matrix[i];
@@ -3360,7 +3361,7 @@ static int encode_picture(MpegEncContext *s, int picture_number)
     if (s->out_format == FMT_MJPEG) {
         /* for mjpeg, we do include qscale in the matrix */
         for(i=1;i<64;i++){
-            int j= s->dsp.idct_permutation[i];
+            int j = s->idsp.idct_permutation[i];
 
             s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
         }
@@ -3589,7 +3590,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s,
             if(s->out_format == FMT_H263){
                 unquant_coeff= alevel*qmul + qadd;
             }else{ //MPEG1
-                j= s->dsp.idct_permutation[ scantable[i] ]; //FIXME optimize
+                j = s->idsp.idct_permutation[scantable[i]]; // FIXME: optimize
                 if(s->mb_intra){
                         unquant_coeff = (int)(  alevel  * qscale * s->intra_matrix[j]) >> 3;
                         unquant_coeff =   (unquant_coeff - 1) | 1;
@@ -3795,7 +3796,7 @@ static int messed_sign=0;
 #endif
 
     if(basis[0][0] == 0)
-        build_basis(s->dsp.idct_permutation);
+        build_basis(s->idsp.idct_permutation);
 
     qmul= qscale*2;
     qadd= (qscale-1)|1;
@@ -4214,8 +4215,9 @@ int ff_dct_quantize_c(MpegEncContext *s,
     *overflow= s->max_qcoeff < max; //overflow might have happened
 
     /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */
-    if (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)
-        ff_block_permute(block, s->dsp.idct_permutation, scantable, last_non_zero);
+    if (s->idsp.idct_permutation_type != FF_NO_IDCT_PERM)
+        ff_block_permute(block, s->idsp.idct_permutation,
+                         scantable, last_non_zero);
 
     return last_non_zero;
 }
diff --git a/libavcodec/mpegvideo_xvmc.c b/libavcodec/mpegvideo_xvmc.c
index aa6f49ade6..a8e068b76c 100644
--- a/libavcodec/mpegvideo_xvmc.c
+++ b/libavcodec/mpegvideo_xvmc.c
@@ -307,7 +307,7 @@ void ff_xvmc_decode_mb(MpegEncContext *s)
             if (s->mb_intra && (render->idct || !render->unsigned_intra))
                 *s->pblocks[i][0] -= 1 << 10;
             if (!render->idct) {
-                s->dsp.idct(*s->pblocks[i]);
+                s->idsp.idct(*s->pblocks[i]);
                 /* It is unclear if MC hardware requires pixel diff values to be
                  * in the range [-255;255]. TODO: Clipping if such hardware is
                  * ever found. As of now it would only be an unnecessary
diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index f0eaa9bb5b..95b5c93ea3 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -28,7 +28,7 @@
  */
 
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "mpegvideo.h"
 #include "msmpeg4.h"
 #include "libavutil/x86/asm.h"
@@ -136,10 +136,10 @@ av_cold void ff_msmpeg4_common_init(MpegEncContext *s)
 
 
     if(s->msmpeg4_version>=4){
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable  , ff_wmv1_scantable[1]);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_wmv1_scantable[2]);
-        ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_wmv1_scantable[3]);
-        ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable  , ff_wmv1_scantable[0]);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,   ff_wmv1_scantable[1]);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_wmv1_scantable[2]);
+        ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_wmv1_scantable[3]);
+        ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,   ff_wmv1_scantable[0]);
     }
     //Note the default tables are set in common_init in mpegvideo.c
 
diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c
index 1bd848d519..c31ff11222 100644
--- a/libavcodec/nuv.c
+++ b/libavcodec/nuv.c
@@ -28,6 +28,7 @@
 #include "libavutil/lzo.h"
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "rtjpeg.h"
 
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index c6c0bcb241..ee0c18c09e 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264DSP)                 += ppc/h264dsp.o
 OBJS-$(CONFIG_H264QPEL)                += ppc/h264qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += ppc/hpeldsp_altivec.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += ppc/huffyuvdsp_altivec.o
+OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
                                           ppc/mpegvideodsp.o
@@ -24,7 +25,6 @@ OBJS-$(CONFIG_VP8_DECODER)             += ppc/vp8dsp_altivec.o
 
 ALTIVEC-OBJS-$(CONFIG_DSPUTIL)         += ppc/dsputil_altivec.o         \
                                           ppc/fdct_altivec.o            \
-                                          ppc/idct_altivec.o            \
 
 FFT-OBJS-$(HAVE_GNU_AS)                += ppc/fft_altivec_s.o
 ALTIVEC-OBJS-$(CONFIG_FFT)             += $(FFT-OBJS-yes)
diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h
index 42da933dfa..be5fd58669 100644
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
@@ -29,9 +29,6 @@
 
 void ff_fdct_altivec(int16_t *block);
 
-void ff_idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
 void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
                              unsigned high_bit_depth);
 
diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c
index 778d3e1247..b54111310e 100644
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -42,12 +42,6 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx,
                 c->fdct = ff_fdct_altivec;
             }
 #endif //CONFIG_ENCODERS
-            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
-                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
-                c->idct_put              = ff_idct_put_altivec;
-                c->idct_add              = ff_idct_add_altivec;
-                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
-            }
         }
     }
 }
diff --git a/libavcodec/ppc/idct_altivec.c b/libavcodec/ppc/idct_altivec.c
deleted file mode 100644
index 82fd9296f0..0000000000
--- a/libavcodec/ppc/idct_altivec.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2001 Michel Lespinasse
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/* NOTE: This code is based on GPL code from the libmpeg2 project.  The
- * author, Michel Lespinasses, has given explicit permission to release
- * under LGPL as part of Libav.
- *
- * Libav integration by Dieter Shirley
- *
- * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
- * project.  I've deleted all of the libmpeg2-specific code, renamed the
- * functions and reordered the function parameters.  The only change to the
- * IDCT function itself was to factor out the partial transposition, and to
- * perform a full transpose at the end of the function. */
-
-#include <stdlib.h>
-#include <string.h>
-#include "config.h"
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
-#include "libavutil/ppc/types_altivec.h"
-#include "dsputil_altivec.h"
-
-#define IDCT_HALF                                       \
-    /* 1st stage */                                     \
-    t1 = vec_mradds(a1, vx7, vx1);                      \
-    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7));      \
-    t7 = vec_mradds(a2, vx5, vx3);                      \
-    t3 = vec_mradds(ma2, vx3, vx5);                     \
-                                                        \
-    /* 2nd stage */                                     \
-    t5 = vec_adds(vx0, vx4);                            \
-    t0 = vec_subs(vx0, vx4);                            \
-    t2 = vec_mradds(a0, vx6, vx2);                      \
-    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6));      \
-    t6 = vec_adds(t8, t3);                              \
-    t3 = vec_subs(t8, t3);                              \
-    t8 = vec_subs(t1, t7);                              \
-    t1 = vec_adds(t1, t7);                              \
-                                                        \
-    /* 3rd stage */                                     \
-    t7 = vec_adds(t5, t2);                              \
-    t2 = vec_subs(t5, t2);                              \
-    t5 = vec_adds(t0, t4);                              \
-    t0 = vec_subs(t0, t4);                              \
-    t4 = vec_subs(t8, t3);                              \
-    t3 = vec_adds(t8, t3);                              \
-                                                        \
-    /* 4th stage */                                     \
-    vy0 = vec_adds(t7, t1);                             \
-    vy7 = vec_subs(t7, t1);                             \
-    vy1 = vec_mradds(c4, t3, t5);                       \
-    vy6 = vec_mradds(mc4, t3, t5);                      \
-    vy2 = vec_mradds(c4, t4, t0);                       \
-    vy5 = vec_mradds(mc4, t4, t0);                      \
-    vy3 = vec_adds(t2, t6);                             \
-    vy4 = vec_subs(t2, t6)
-
-#define IDCT                                                                \
-    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                         \
-    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                             \
-                                                                            \
-    vec_s16 c4   = vec_splat(constants[0], 0);                              \
-    vec_s16 a0   = vec_splat(constants[0], 1);                              \
-    vec_s16 a1   = vec_splat(constants[0], 2);                              \
-    vec_s16 a2   = vec_splat(constants[0], 3);                              \
-    vec_s16 mc4  = vec_splat(constants[0], 4);                              \
-    vec_s16 ma2  = vec_splat(constants[0], 5);                              \
-    vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3);          \
-                                                                            \
-    vec_s16 zero  = vec_splat_s16(0);                                       \
-    vec_u16 shift = vec_splat_u16(4);                                       \
-                                                                            \
-    vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero);  \
-    vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero);  \
-    vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero);  \
-    vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero);  \
-    vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero);  \
-    vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero);  \
-    vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero);  \
-    vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero);  \
-                                                                            \
-    IDCT_HALF;                                                              \
-                                                                            \
-    vx0 = vec_mergeh(vy0, vy4);                                             \
-    vx1 = vec_mergel(vy0, vy4);                                             \
-    vx2 = vec_mergeh(vy1, vy5);                                             \
-    vx3 = vec_mergel(vy1, vy5);                                             \
-    vx4 = vec_mergeh(vy2, vy6);                                             \
-    vx5 = vec_mergel(vy2, vy6);                                             \
-    vx6 = vec_mergeh(vy3, vy7);                                             \
-    vx7 = vec_mergel(vy3, vy7);                                             \
-                                                                            \
-    vy0 = vec_mergeh(vx0, vx4);                                             \
-    vy1 = vec_mergel(vx0, vx4);                                             \
-    vy2 = vec_mergeh(vx1, vx5);                                             \
-    vy3 = vec_mergel(vx1, vx5);                                             \
-    vy4 = vec_mergeh(vx2, vx6);                                             \
-    vy5 = vec_mergel(vx2, vx6);                                             \
-    vy6 = vec_mergeh(vx3, vx7);                                             \
-    vy7 = vec_mergel(vx3, vx7);                                             \
-                                                                            \
-    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias);                             \
-    vx1 = vec_mergel(vy0, vy4);                                             \
-    vx2 = vec_mergeh(vy1, vy5);                                             \
-    vx3 = vec_mergel(vy1, vy5);                                             \
-    vx4 = vec_mergeh(vy2, vy6);                                             \
-    vx5 = vec_mergel(vy2, vy6);                                             \
-    vx6 = vec_mergeh(vy3, vy7);                                             \
-    vx7 = vec_mergel(vy3, vy7);                                             \
-                                                                            \
-    IDCT_HALF;                                                              \
-                                                                            \
-    shift = vec_splat_u16(6);                                               \
-    vx0 = vec_sra(vy0, shift);                                              \
-    vx1 = vec_sra(vy1, shift);                                              \
-    vx2 = vec_sra(vy2, shift);                                              \
-    vx3 = vec_sra(vy3, shift);                                              \
-    vx4 = vec_sra(vy4, shift);                                              \
-    vx5 = vec_sra(vy5, shift);                                              \
-    vx6 = vec_sra(vy6, shift);                                              \
-    vx7 = vec_sra(vy7, shift)
-
-static const vec_s16 constants[5] = {
-    { 23170, 13573,  6518, 21895, -23170, -21895,    32,    31 },
-    { 16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725 },
-    { 22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521 },
-    { 21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692 },
-    { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
-};
-
-void ff_idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
-{
-    vec_s16 *block = (vec_s16 *) blk;
-    vec_u8 tmp;
-
-    IDCT;
-
-#define COPY(dest, src)                                     \
-    tmp = vec_packsu(src, src);                             \
-    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
-    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
-
-    COPY(dest, vx0);
-    dest += stride;
-    COPY(dest, vx1);
-    dest += stride;
-    COPY(dest, vx2);
-    dest += stride;
-    COPY(dest, vx3);
-    dest += stride;
-    COPY(dest, vx4);
-    dest += stride;
-    COPY(dest, vx5);
-    dest += stride;
-    COPY(dest, vx6);
-    dest += stride;
-    COPY(dest, vx7);
-}
-
-void ff_idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
-{
-    vec_s16 *block = (vec_s16 *) blk;
-    vec_u8 tmp;
-    vec_s16 tmp2, tmp3;
-    vec_u8 perm0;
-    vec_u8 perm1;
-    vec_u8 p0, p1, p;
-
-    IDCT;
-
-    p0    = vec_lvsl(0, dest);
-    p1    = vec_lvsl(stride, dest);
-    p     = vec_splat_u8(-1);
-    perm0 = vec_mergeh(p, p0);
-    perm1 = vec_mergeh(p, p1);
-
-#define ADD(dest, src, perm)                                \
-    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */          \
-    tmp  = vec_ld(0, dest);                                 \
-    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);    \
-    tmp3 = vec_adds(tmp2, src);                             \
-    tmp  = vec_packsu(tmp3, tmp3);                          \
-    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
-    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
-
-    ADD(dest, vx0, perm0);
-    dest += stride;
-    ADD(dest, vx1, perm1);
-    dest += stride;
-    ADD(dest, vx2, perm0);
-    dest += stride;
-    ADD(dest, vx3, perm1);
-    dest += stride;
-    ADD(dest, vx4, perm0);
-    dest += stride;
-    ADD(dest, vx5, perm1);
-    dest += stride;
-    ADD(dest, vx6, perm0);
-    dest += stride;
-    ADD(dest, vx7, perm1);
-}
diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c
new file mode 100644
index 0000000000..8a1d2903d8
--- /dev/null
+++ b/libavcodec/ppc/idctdsp.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2001 Michel Lespinasse
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* NOTE: This code is based on GPL code from the libmpeg2 project.  The
+ * author, Michel Lespinasse, has given explicit permission to release
+ * under LGPL as part of Libav.
+ *
+ * Libav integration by Dieter Shirley
+ *
+ * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
+ * project.  I've deleted all of the libmpeg2-specific code, renamed the
+ * functions and reordered the function parameters.  The only change to the
+ * IDCT function itself was to factor out the partial transposition, and to
+ * perform a full transpose at the end of the function. */
+
+#include <stdlib.h>
+#include <string.h>
+#include "config.h"
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavcodec/idctdsp.h"
+
+#if HAVE_ALTIVEC
+
+#define IDCT_HALF                                       \
+    /* 1st stage */                                     \
+    t1 = vec_mradds(a1, vx7, vx1);                      \
+    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7));      \
+    t7 = vec_mradds(a2, vx5, vx3);                      \
+    t3 = vec_mradds(ma2, vx3, vx5);                     \
+                                                        \
+    /* 2nd stage */                                     \
+    t5 = vec_adds(vx0, vx4);                            \
+    t0 = vec_subs(vx0, vx4);                            \
+    t2 = vec_mradds(a0, vx6, vx2);                      \
+    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6));      \
+    t6 = vec_adds(t8, t3);                              \
+    t3 = vec_subs(t8, t3);                              \
+    t8 = vec_subs(t1, t7);                              \
+    t1 = vec_adds(t1, t7);                              \
+                                                        \
+    /* 3rd stage */                                     \
+    t7 = vec_adds(t5, t2);                              \
+    t2 = vec_subs(t5, t2);                              \
+    t5 = vec_adds(t0, t4);                              \
+    t0 = vec_subs(t0, t4);                              \
+    t4 = vec_subs(t8, t3);                              \
+    t3 = vec_adds(t8, t3);                              \
+                                                        \
+    /* 4th stage */                                     \
+    vy0 = vec_adds(t7, t1);                             \
+    vy7 = vec_subs(t7, t1);                             \
+    vy1 = vec_mradds(c4, t3, t5);                       \
+    vy6 = vec_mradds(mc4, t3, t5);                      \
+    vy2 = vec_mradds(c4, t4, t0);                       \
+    vy5 = vec_mradds(mc4, t4, t0);                      \
+    vy3 = vec_adds(t2, t6);                             \
+    vy4 = vec_subs(t2, t6)
+
+#define IDCT                                                                \
+    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                         \
+    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                             \
+                                                                            \
+    vec_s16 c4   = vec_splat(constants[0], 0);                              \
+    vec_s16 a0   = vec_splat(constants[0], 1);                              \
+    vec_s16 a1   = vec_splat(constants[0], 2);                              \
+    vec_s16 a2   = vec_splat(constants[0], 3);                              \
+    vec_s16 mc4  = vec_splat(constants[0], 4);                              \
+    vec_s16 ma2  = vec_splat(constants[0], 5);                              \
+    vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3);          \
+                                                                            \
+    vec_s16 zero  = vec_splat_s16(0);                                       \
+    vec_u16 shift = vec_splat_u16(4);                                       \
+                                                                            \
+    vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero);  \
+    vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero);  \
+    vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero);  \
+    vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero);  \
+    vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero);  \
+    vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero);  \
+    vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero);  \
+    vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero);  \
+                                                                            \
+    IDCT_HALF;                                                              \
+                                                                            \
+    vx0 = vec_mergeh(vy0, vy4);                                             \
+    vx1 = vec_mergel(vy0, vy4);                                             \
+    vx2 = vec_mergeh(vy1, vy5);                                             \
+    vx3 = vec_mergel(vy1, vy5);                                             \
+    vx4 = vec_mergeh(vy2, vy6);                                             \
+    vx5 = vec_mergel(vy2, vy6);                                             \
+    vx6 = vec_mergeh(vy3, vy7);                                             \
+    vx7 = vec_mergel(vy3, vy7);                                             \
+                                                                            \
+    vy0 = vec_mergeh(vx0, vx4);                                             \
+    vy1 = vec_mergel(vx0, vx4);                                             \
+    vy2 = vec_mergeh(vx1, vx5);                                             \
+    vy3 = vec_mergel(vx1, vx5);                                             \
+    vy4 = vec_mergeh(vx2, vx6);                                             \
+    vy5 = vec_mergel(vx2, vx6);                                             \
+    vy6 = vec_mergeh(vx3, vx7);                                             \
+    vy7 = vec_mergel(vx3, vx7);                                             \
+                                                                            \
+    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias);                             \
+    vx1 = vec_mergel(vy0, vy4);                                             \
+    vx2 = vec_mergeh(vy1, vy5);                                             \
+    vx3 = vec_mergel(vy1, vy5);                                             \
+    vx4 = vec_mergeh(vy2, vy6);                                             \
+    vx5 = vec_mergel(vy2, vy6);                                             \
+    vx6 = vec_mergeh(vy3, vy7);                                             \
+    vx7 = vec_mergel(vy3, vy7);                                             \
+                                                                            \
+    IDCT_HALF;                                                              \
+                                                                            \
+    shift = vec_splat_u16(6);                                               \
+    vx0 = vec_sra(vy0, shift);                                              \
+    vx1 = vec_sra(vy1, shift);                                              \
+    vx2 = vec_sra(vy2, shift);                                              \
+    vx3 = vec_sra(vy3, shift);                                              \
+    vx4 = vec_sra(vy4, shift);                                              \
+    vx5 = vec_sra(vy5, shift);                                              \
+    vx6 = vec_sra(vy6, shift);                                              \
+    vx7 = vec_sra(vy7, shift)
+
+static const vec_s16 constants[5] = {
+    { 23170, 13573,  6518, 21895, -23170, -21895,    32,    31 },
+    { 16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725 },
+    { 22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521 },
+    { 21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692 },
+    { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
+};
+
+static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
+{
+    vec_s16 *block = (vec_s16 *) blk;
+    vec_u8 tmp;
+
+    IDCT;
+
+#define COPY(dest, src)                                     \
+    tmp = vec_packsu(src, src);                             \
+    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
+    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
+
+    COPY(dest, vx0);
+    dest += stride;
+    COPY(dest, vx1);
+    dest += stride;
+    COPY(dest, vx2);
+    dest += stride;
+    COPY(dest, vx3);
+    dest += stride;
+    COPY(dest, vx4);
+    dest += stride;
+    COPY(dest, vx5);
+    dest += stride;
+    COPY(dest, vx6);
+    dest += stride;
+    COPY(dest, vx7);
+}
+
+static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
+{
+    vec_s16 *block = (vec_s16 *) blk;
+    vec_u8 tmp;
+    vec_s16 tmp2, tmp3;
+    vec_u8 perm0;
+    vec_u8 perm1;
+    vec_u8 p0, p1, p;
+
+    IDCT;
+
+    p0    = vec_lvsl(0, dest);
+    p1    = vec_lvsl(stride, dest);
+    p     = vec_splat_u8(-1);
+    perm0 = vec_mergeh(p, p0);
+    perm1 = vec_mergeh(p, p1);
+
+#define ADD(dest, src, perm)                                \
+    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */          \
+    tmp  = vec_ld(0, dest);                                 \
+    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);    \
+    tmp3 = vec_adds(tmp2, src);                             \
+    tmp  = vec_packsu(tmp3, tmp3);                          \
+    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);       \
+    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
+
+    ADD(dest, vx0, perm0);
+    dest += stride;
+    ADD(dest, vx1, perm1);
+    dest += stride;
+    ADD(dest, vx2, perm0);
+    dest += stride;
+    ADD(dest, vx3, perm1);
+    dest += stride;
+    ADD(dest, vx4, perm0);
+    dest += stride;
+    ADD(dest, vx5, perm1);
+    dest += stride;
+    ADD(dest, vx6, perm0);
+    dest += stride;
+    ADD(dest, vx7, perm1);
+}
+
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{
+#if HAVE_ALTIVEC
+    if (PPC_ALTIVEC(av_get_cpu_flags())) {
+        if (!high_bit_depth) {
+            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
+                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
+                c->idct_add              = idct_add_altivec;
+                c->idct_put              = idct_put_altivec;
+                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+            }
+        }
+    }
+#endif /* HAVE_ALTIVEC */
+}
diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec.c
index 144fa26f8c..03f63d9dce 100644
--- a/libavcodec/proresdec.c
+++ b/libavcodec/proresdec.c
@@ -34,7 +34,7 @@
 
 #include "libavutil/intmath.h"
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "internal.h"
 #include "proresdata.h"
 #include "proresdsp.h"
diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c
index 1d60897cc9..1d92d360d2 100644
--- a/libavcodec/proresdsp.c
+++ b/libavcodec/proresdsp.c
@@ -23,7 +23,7 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "proresdsp.h"
 #include "simple_idct.h"
 
diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c
index 3188e6f6aa..67eeff8f4a 100644
--- a/libavcodec/rtjpeg.c
+++ b/libavcodec/rtjpeg.c
@@ -121,7 +121,7 @@ int ff_rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f,
     if (res < 0) \
         return res; \
     if (res > 0) \
-        c->dsp.idct_put(dst, stride, block); \
+        c->idsp.idct_put(dst, stride, block); \
 } while (0)
             int16_t *block = c->block;
             BLOCK(c->lquant, y1, f->linesize[0]);
@@ -159,7 +159,7 @@ void ff_rtjpeg_decode_init(RTJpegContext *c, int width, int height,
                            const uint32_t *lquant, const uint32_t *cquant) {
     int i;
     for (i = 0; i < 64; i++) {
-        int p = c->dsp.idct_permutation[i];
+        int p = c->idsp.idct_permutation[i];
         c->lquant[p] = lquant[i];
         c->cquant[p] = cquant[i];
     }
@@ -171,13 +171,13 @@ void ff_rtjpeg_init(RTJpegContext *c, AVCodecContext *avctx)
 {
     int i;
 
-    ff_dsputil_init(&c->dsp, avctx);
+    ff_idctdsp_init(&c->idsp, avctx);
 
     for (i = 0; i < 64; i++) {
         int z = ff_zigzag_direct[i];
         z = ((z << 3) | (z >> 3)) & 63; // rtjpeg uses a transposed variant
 
         // permute the scan and quantization tables for the chosen idct
-        c->scan[i] = c->dsp.idct_permutation[z];
+        c->scan[i] = c->idsp.idct_permutation[z];
     }
 }
diff --git a/libavcodec/rtjpeg.h b/libavcodec/rtjpeg.h
index 23609b3eb9..cd300797c5 100644
--- a/libavcodec/rtjpeg.h
+++ b/libavcodec/rtjpeg.h
@@ -23,15 +23,16 @@
 #define AVCODEC_RTJPEG_H
 
 #include <stdint.h>
-#include "dsputil.h"
+
 #include "libavutil/mem.h"
+#include "idctdsp.h"
 
 #define RTJPEG_FILE_VERSION 0
 #define RTJPEG_HEADER_SIZE 12
 
 typedef struct RTJpegContext {
     int w, h;
-    DSPContext dsp;
+    IDCTDSPContext idsp;
     uint8_t scan[64];
     uint32_t lquant[64];
     uint32_t cquant[64];
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 6d6c1ec4fa..c83bb4fb77 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -109,24 +109,24 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
                 fieldtx = v->fieldtx_plane[topleft_mb_pos];
             stride_y       = s->linesize << fieldtx;
             v_dist         = (16 - fieldtx) >> (fieldtx == 0);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
-                                             s->dest[0] - 16 * s->linesize - 16,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1],
-                                             s->dest[0] - 16 * s->linesize - 8,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2],
-                                             s->dest[0] - v_dist * s->linesize - 16,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
-                                             s->dest[0] - v_dist * s->linesize - 8,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
-                                             s->dest[1] - 8 * s->uvlinesize - 8,
-                                             s->uvlinesize);
-            s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
-                                             s->dest[2] - 8 * s->uvlinesize - 8,
-                                             s->uvlinesize);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0],
+                                              s->dest[0] - 16 * s->linesize - 16,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1],
+                                              s->dest[0] - 16 * s->linesize - 8,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2],
+                                              s->dest[0] - v_dist * s->linesize - 16,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3],
+                                              s->dest[0] - v_dist * s->linesize - 8,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4],
+                                              s->dest[1] - 8 * s->uvlinesize - 8,
+                                              s->uvlinesize);
+            s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5],
+                                              s->dest[2] - 8 * s->uvlinesize - 8,
+                                              s->uvlinesize);
         }
         if (s->mb_x == s->mb_width - 1) {
             top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x;
@@ -134,24 +134,24 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v)
                 fieldtx = v->fieldtx_plane[top_mb_pos];
             stride_y   = s->linesize << fieldtx;
             v_dist     = fieldtx ? 15 : 8;
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
-                                             s->dest[0] - 16 * s->linesize,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1],
-                                             s->dest[0] - 16 * s->linesize + 8,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2],
-                                             s->dest[0] - v_dist * s->linesize,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
-                                             s->dest[0] - v_dist * s->linesize + 8,
-                                             stride_y);
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
-                                             s->dest[1] - 8 * s->uvlinesize,
-                                             s->uvlinesize);
-            s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
-                                             s->dest[2] - 8 * s->uvlinesize,
-                                             s->uvlinesize);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0],
+                                              s->dest[0] - 16 * s->linesize,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1],
+                                              s->dest[0] - 16 * s->linesize + 8,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2],
+                                              s->dest[0] - v_dist * s->linesize,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3],
+                                              s->dest[0] - v_dist * s->linesize + 8,
+                                              stride_y);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4],
+                                              s->dest[1] - 8 * s->uvlinesize,
+                                              s->uvlinesize);
+            s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5],
+                                              s->dest[2] - 8 * s->uvlinesize,
+                                              s->uvlinesize);
         }
     }
 
@@ -3280,7 +3280,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                 v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
             else {
                 v->vc1dsp.vc1_inv_trans_8x8(block);
-                s->dsp.add_pixels_clamped(block, dst, linesize);
+                s->idsp.add_pixels_clamped(block, dst, linesize);
             }
         }
         break;
@@ -3611,7 +3611,10 @@ static int vc1_decode_p_mb(VC1Context *v)
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[i][j] <<= 1;
-                    s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+                    s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                      s->dest[dst_idx] + off,
+                                                      i & 4 ? s->uvlinesize
+                                                            : s->linesize);
                     if (v->pq >= 9 && v->overlap) {
                         if (v->c_avail)
                             v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@@ -3719,8 +3722,10 @@ static int vc1_decode_p_mb(VC1Context *v)
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[i][j] <<= 1;
-                    s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off,
-                                                     (i & 4) ? s->uvlinesize : s->linesize);
+                    s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                      s->dest[dst_idx] + off,
+                                                      (i & 4) ? s->uvlinesize
+                                                              : s->linesize);
                     if (v->pq >= 9 && v->overlap) {
                         if (v->c_avail)
                             v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
@@ -3869,7 +3874,9 @@ static int vc1_decode_p_mb_intfr(VC1Context *v)
                     stride_y = s->uvlinesize;
                     off = 0;
                 }
-                s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, stride_y);
+                s->idsp.put_signed_pixels_clamped(s->block[i],
+                                                  s->dest[dst_idx] + off,
+                                                  stride_y);
                 //TODO: loop filter
             }
 
@@ -4031,7 +4038,10 @@ static int vc1_decode_p_mb_intfi(VC1Context *v)
                 continue;
             v->vc1dsp.vc1_inv_trans_8x8(s->block[i]);
             off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize);
+            s->idsp.put_signed_pixels_clamped(s->block[i],
+                                              s->dest[dst_idx] + off,
+                                              (i & 4) ? s->uvlinesize
+                                                      : s->linesize);
             // TODO: loop filter
         }
     } else {
@@ -4233,7 +4243,10 @@ static void vc1_decode_b_mb(VC1Context *v)
             if (v->rangeredfrm)
                 for (j = 0; j < 64; j++)
                     s->block[i][j] <<= 1;
-            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize);
+            s->idsp.put_signed_pixels_clamped(s->block[i],
+                                              s->dest[dst_idx] + off,
+                                              i & 4 ? s->uvlinesize
+                                                    : s->linesize);
         } else if (val) {
             vc1_decode_p_block(v, s->block[i], i, mquant, ttmb,
                                first_block, s->dest[dst_idx] + off,
@@ -4305,7 +4318,10 @@ static void vc1_decode_b_mb_intfi(VC1Context *v)
                 for (j = 0; j < 64; j++)
                     s->block[i][j] <<= 1;
             off  = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize);
-            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize);
+            s->idsp.put_signed_pixels_clamped(s->block[i],
+                                              s->dest[dst_idx] + off,
+                                              (i & 4) ? s->uvlinesize
+                                                      : s->linesize);
             // TODO: yet to perform loop filter
         }
     } else {
@@ -4524,7 +4540,9 @@ static int vc1_decode_b_mb_intfr(VC1Context *v)
                 stride_y = s->uvlinesize;
                 off = 0;
             }
-            s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, stride_y);
+            s->idsp.put_signed_pixels_clamped(s->block[i],
+                                              s->dest[dst_idx] + off,
+                                              stride_y);
         }
     } else {
         s->mb_intra = v->is_intra[s->mb_x] = 0;
@@ -4828,12 +4846,16 @@ static void vc1_decode_i_blocks(VC1Context *v)
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[k][j] <<= 1;
-                    s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
+                    s->idsp.put_signed_pixels_clamped(s->block[k], dst[k],
+                                                      k & 4 ? s->uvlinesize
+                                                            : s->linesize);
                 } else {
                     if (v->rangeredfrm)
                         for (j = 0; j < 64; j++)
                             s->block[k][j] = (s->block[k][j] - 64) << 1;
-                    s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize);
+                    s->idsp.put_pixels_clamped(s->block[k], dst[k],
+                                               k & 4 ? s->uvlinesize
+                                                     : s->linesize);
                 }
             }
 
diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c
index bd799d0e8b..b6c7bc0a9f 100644
--- a/libavcodec/wmv2.c
+++ b/libavcodec/wmv2.c
@@ -19,6 +19,7 @@
  */
 
 #include "avcodec.h"
+#include "idctdsp.h"
 #include "mpegvideo.h"
 #include "msmpeg4data.h"
 #include "simple_idct.h"
@@ -30,24 +31,24 @@ av_cold void ff_wmv2_common_init(Wmv2Context * w){
 
     ff_blockdsp_init(&s->bdsp, s->avctx);
     ff_wmv2dsp_init(&w->wdsp);
-    s->dsp.idct_permutation_type = w->wdsp.idct_perm;
-    ff_init_scantable_permutation(s->dsp.idct_permutation,
+    s->idsp.idct_permutation_type = w->wdsp.idct_perm;
+    ff_init_scantable_permutation(s->idsp.idct_permutation,
                                   w->wdsp.idct_perm);
-    ff_init_scantable(s->dsp.idct_permutation, &w->abt_scantable[0],
+    ff_init_scantable(s->idsp.idct_permutation, &w->abt_scantable[0],
                       ff_wmv2_scantableA);
-    ff_init_scantable(s->dsp.idct_permutation, &w->abt_scantable[1],
+    ff_init_scantable(s->idsp.idct_permutation, &w->abt_scantable[1],
                       ff_wmv2_scantableB);
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable,
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable,
                       ff_wmv1_scantable[1]);
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable,
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable,
                       ff_wmv1_scantable[2]);
-    ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable,
+    ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable,
                       ff_wmv1_scantable[3]);
-    ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable,
+    ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable,
                       ff_wmv1_scantable[0]);
-    s->dsp.idct_put = w->wdsp.idct_put;
-    s->dsp.idct_add = w->wdsp.idct_add;
-    s->dsp.idct     = NULL;
+    s->idsp.idct_put = w->wdsp.idct_put;
+    s->idsp.idct_add = w->wdsp.idct_add;
+    s->idsp.idct     = NULL;
 }
 
 static void wmv2_add_block(Wmv2Context *w, int16_t *block1, uint8_t *dst, int stride, int n){
diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c
index dff49f47a4..49df43690a 100644
--- a/libavcodec/wmv2dsp.c
+++ b/libavcodec/wmv2dsp.c
@@ -19,7 +19,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 #include "avcodec.h"
-#include "dsputil.h"
+#include "idctdsp.h"
 #include "mathops.h"
 #include "wmv2dsp.h"
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 13f9affdb2..14e58f9a9c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -18,6 +18,7 @@ OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
@@ -49,13 +50,14 @@ OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
 
 MMX-OBJS-$(CONFIG_AUDIODSP)            += x86/audiodsp_mmx.o
 MMX-OBJS-$(CONFIG_BLOCKDSP)            += x86/blockdsp_mmx.o
-MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o             \
-                                          x86/idct_mmx_xvid.o           \
-                                          x86/idct_sse2_xvid.o          \
-                                          x86/simple_idct.o
+MMX-OBJS-$(CONFIG_DSPUTIL)             += x86/dsputil_mmx.o
 MMX-OBJS-$(CONFIG_HPELDSP)             += x86/fpel_mmx.o                \
                                           x86/hpeldsp_mmx.o
 MMX-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
+                                          x86/idct_mmx_xvid.o           \
+                                          x86/idct_sse2_xvid.o          \
+                                          x86/simple_idct.o
 MMX-OBJS-$(CONFIG_QPELDSP)             += x86/fpel_mmx.o
 
 MMX-OBJS-$(CONFIG_SVQ1_ENCODER)        += x86/svq1enc_mmx.o
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index d5c441f1e5..f0e8cfcd17 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -28,9 +28,10 @@
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/cavsdsp.h"
+#include "libavcodec/idctdsp.h"
 #include "constants.h"
-#include "dsputil_x86.h"
 #include "fpel.h"
+#include "idctdsp.h"
 #include "config.h"
 
 #if HAVE_MMX_INLINE
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 74dab48e72..adc7aa95d6 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -22,97 +22,18 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
-#include "libavcodec/simple_idct.h"
 #include "dsputil_x86.h"
-#include "idct_xvid.h"
-
-/* Input permutation for the simple_idct_mmx */
-static const uint8_t simple_mmx_permutation[64] = {
-    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
-    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
-    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
-    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
-    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
-    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
-    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
-    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
-};
-
-static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
-                                              int idct_permutation_type)
-{
-    int i;
-
-    switch (idct_permutation_type) {
-    case FF_SIMPLE_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = simple_mmx_permutation[i];
-        return 1;
-    case FF_SSE2_IDCT_PERM:
-        for (i = 0; i < 64; i++)
-            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
-        return 1;
-    }
-
-    return 0;
-}
 
 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_MMX_INLINE
-    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
-    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
-    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
-
     if (!high_bit_depth) {
         c->draw_edges   = ff_draw_edges_mmx;
-
-        switch (avctx->idct_algo) {
-        case FF_IDCT_AUTO:
-        case FF_IDCT_SIMPLEMMX:
-            c->idct_put              = ff_simple_idct_put_mmx;
-            c->idct_add              = ff_simple_idct_add_mmx;
-            c->idct                  = ff_simple_idct_mmx;
-            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
-            break;
-        case FF_IDCT_XVIDMMX:
-            c->idct_put              = ff_idct_xvid_mmx_put;
-            c->idct_add              = ff_idct_xvid_mmx_add;
-            c->idct                  = ff_idct_xvid_mmx;
-            break;
-        }
     }
 #endif /* HAVE_MMX_INLINE */
 }
 
-static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
-                                        int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_MMXEXT_INLINE
-    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
-        c->idct_put = ff_idct_xvid_mmxext_put;
-        c->idct_add = ff_idct_xvid_mmxext_add;
-        c->idct     = ff_idct_xvid_mmxext;
-    }
-#endif /* HAVE_MMXEXT_INLINE */
-}
-
-static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
-                                      int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_SSE2_INLINE
-    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
-        c->idct_put              = ff_idct_xvid_sse2_put;
-        c->idct_add              = ff_idct_xvid_sse2_add;
-        c->idct                  = ff_idct_xvid_sse2;
-        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
-    }
-#endif /* HAVE_SSE2_INLINE */
-}
-
 av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
                                  unsigned high_bit_depth)
 {
@@ -121,12 +42,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
     if (X86_MMX(cpu_flags))
         dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);
 
-    if (X86_MMXEXT(cpu_flags))
-        dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);
-
-    if (X86_SSE2(cpu_flags))
-        dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);
-
     if (CONFIG_ENCODERS)
         ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
 }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5fa047da7b..d205a48ea4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -30,141 +30,6 @@
 
 #if HAVE_INLINE_ASM
 
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    /* unrolled loop */
-    __asm__ volatile (
-        "movq      (%3), %%mm0          \n\t"
-        "movq     8(%3), %%mm1          \n\t"
-        "movq    16(%3), %%mm2          \n\t"
-        "movq    24(%3), %%mm3          \n\t"
-        "movq    32(%3), %%mm4          \n\t"
-        "movq    40(%3), %%mm5          \n\t"
-        "movq    48(%3), %%mm6          \n\t"
-        "movq    56(%3), %%mm7          \n\t"
-        "packuswb %%mm1, %%mm0          \n\t"
-        "packuswb %%mm3, %%mm2          \n\t"
-        "packuswb %%mm5, %%mm4          \n\t"
-        "packuswb %%mm7, %%mm6          \n\t"
-        "movq     %%mm0, (%0)           \n\t"
-        "movq     %%mm2, (%0, %1)       \n\t"
-        "movq     %%mm4, (%0, %1, 2)    \n\t"
-        "movq     %%mm6, (%0, %2)       \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-    pix += line_size * 4;
-    p   += 32;
-
-    // if here would be an exact copy of the code above
-    // compiler would generate some very strange code
-    // thus using "r"
-    __asm__ volatile (
-        "movq       (%3), %%mm0         \n\t"
-        "movq      8(%3), %%mm1         \n\t"
-        "movq     16(%3), %%mm2         \n\t"
-        "movq     24(%3), %%mm3         \n\t"
-        "movq     32(%3), %%mm4         \n\t"
-        "movq     40(%3), %%mm5         \n\t"
-        "movq     48(%3), %%mm6         \n\t"
-        "movq     56(%3), %%mm7         \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "packuswb  %%mm3, %%mm2         \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "packuswb  %%mm7, %%mm6         \n\t"
-        "movq      %%mm0, (%0)          \n\t"
-        "movq      %%mm2, (%0, %1)      \n\t"
-        "movq      %%mm4, (%0, %1, 2)   \n\t"
-        "movq      %%mm6, (%0, %2)      \n\t"
-        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
-           "r" (p)
-        : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off)             \
-    "movq          "#off"(%2), %%mm1        \n\t"           \
-    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
-    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
-    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
-    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
-    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
-    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
-    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
-    "paddb              %%mm0, %%mm1        \n\t"           \
-    "paddb              %%mm0, %%mm2        \n\t"           \
-    "paddb              %%mm0, %%mm3        \n\t"           \
-    "paddb              %%mm0, %%mm4        \n\t"           \
-    "movq               %%mm1, (%0)         \n\t"           \
-    "movq               %%mm2, (%0, %3)     \n\t"           \
-    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
-    "movq               %%mm4, (%0, %1)     \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size)
-{
-    x86_reg line_skip = line_size;
-    x86_reg line_skip3;
-
-    __asm__ volatile (
-        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
-        "lea         (%3, %3, 2), %1        \n\t"
-        put_signed_pixels_clamped_mmx_half(0)
-        "lea         (%0, %3, 4), %0        \n\t"
-        put_signed_pixels_clamped_mmx_half(64)
-        : "+&r" (pixels), "=&r" (line_skip3)
-        : "r" (block), "r" (line_skip)
-        : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size)
-{
-    const int16_t *p;
-    uint8_t *pix;
-    int i;
-
-    /* read the pixels */
-    p   = block;
-    pix = pixels;
-    MOVQ_ZERO(mm7);
-    i = 4;
-    do {
-        __asm__ volatile (
-            "movq        (%2), %%mm0    \n\t"
-            "movq       8(%2), %%mm1    \n\t"
-            "movq      16(%2), %%mm2    \n\t"
-            "movq      24(%2), %%mm3    \n\t"
-            "movq          %0, %%mm4    \n\t"
-            "movq          %1, %%mm6    \n\t"
-            "movq       %%mm4, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm4    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm4, %%mm0    \n\t"
-            "paddsw     %%mm5, %%mm1    \n\t"
-            "movq       %%mm6, %%mm5    \n\t"
-            "punpcklbw  %%mm7, %%mm6    \n\t"
-            "punpckhbw  %%mm7, %%mm5    \n\t"
-            "paddsw     %%mm6, %%mm2    \n\t"
-            "paddsw     %%mm5, %%mm3    \n\t"
-            "packuswb   %%mm1, %%mm0    \n\t"
-            "packuswb   %%mm3, %%mm2    \n\t"
-            "movq       %%mm0, %0       \n\t"
-            "movq       %%mm2, %1       \n\t"
-            : "+m" (*pix), "+m" (*(pix + line_size))
-            : "r" (p)
-            : "memory");
-        pix += line_size * 2;
-        p   += 16;
-    } while (--i);
-}
-
 /* Draw the edges of width 'w' of an image of size width, height
  * this MMX version can only handle w == 8 || w == 16. */
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 4beb6c11ca..7e1e8af051 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -31,13 +31,6 @@ void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                             unsigned high_bit_depth);
 void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx);
 
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                               int line_size);
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
-                                      int line_size);
-
 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                        int w, int h, int sides);
 
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 27723393bf..920ea4c0dc 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -44,8 +44,8 @@
 #include "config.h"
 #include "libavcodec/avcodec.h"
 #include "libavutil/mem.h"
-#include "dsputil_x86.h"
 #include "idct_xvid.h"
+#include "idctdsp.h"
 
 #if HAVE_MMX_INLINE
 
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 50655d6bc0..aadeb122c6 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -42,7 +42,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
 #include "idct_xvid.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
 
 #if HAVE_SSE2_INLINE
 
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
new file mode 100644
index 0000000000..22df3dd758
--- /dev/null
+++ b/libavcodec/x86/idctdsp.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_IDCTDSP_H
+#define AVCODEC_X86_IDCTDSP_H
+
+#include <stdint.h>
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size);
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size);
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                                      int line_size);
+
+#endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
new file mode 100644
index 0000000000..9b68497502
--- /dev/null
+++ b/libavcodec/x86/idctdsp_init.c
@@ -0,0 +1,106 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/simple_idct.h"
+#include "idct_xvid.h"
+#include "idctdsp.h"
+
+/* Input permutation for the simple_idct_mmx */
+static const uint8_t simple_mmx_permutation[64] = {
+    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
+    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
+    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
+    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
+    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
+    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
+    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
+    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
+};
+
+static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+                                              int idct_permutation_type)
+{
+    int i;
+
+    switch (idct_permutation_type) {
+    case FF_SIMPLE_IDCT_PERM:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = simple_mmx_permutation[i];
+        return 1;
+    case FF_SSE2_IDCT_PERM:
+        for (i = 0; i < 64; i++)
+            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
+        return 1;
+    }
+
+    return 0;
+}
+
+av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+                                 unsigned high_bit_depth)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (INLINE_MMX(cpu_flags)) {
+        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
+        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
+
+        if (!high_bit_depth) {
+            switch (avctx->idct_algo) {
+            case FF_IDCT_AUTO:
+            case FF_IDCT_SIMPLEMMX:
+                c->idct_put              = ff_simple_idct_put_mmx;
+                c->idct_add              = ff_simple_idct_add_mmx;
+                c->idct                  = ff_simple_idct_mmx;
+                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+                break;
+            case FF_IDCT_XVIDMMX:
+                c->idct_put              = ff_idct_xvid_mmx_put;
+                c->idct_add              = ff_idct_xvid_mmx_add;
+                c->idct                  = ff_idct_xvid_mmx;
+                break;
+            }
+        }
+    }
+
+    if (INLINE_MMXEXT(cpu_flags)) {
+        if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+            c->idct_put = ff_idct_xvid_mmxext_put;
+            c->idct_add = ff_idct_xvid_mmxext_add;
+            c->idct     = ff_idct_xvid_mmxext;
+        }
+    }
+
+    if (INLINE_SSE2(cpu_flags)) {
+        if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+            c->idct_put              = ff_idct_xvid_sse2_put;
+            c->idct_add              = ff_idct_xvid_sse2_add;
+            c->idct                  = ff_idct_xvid_sse2;
+            c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+        }
+    }
+}
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
new file mode 100644
index 0000000000..7285b1d357
--- /dev/null
+++ b/libavcodec/x86/idctdsp_mmx.c
@@ -0,0 +1,168 @@
+/*
+ * SIMD-optimized IDCT-related routines
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "idctdsp.h"
+#include "inline_asm.h"
+
+#if HAVE_INLINE_ASM
+
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size)
+{
+    const int16_t *p;
+    uint8_t *pix;
+
+    /* read the pixels */
+    p   = block;
+    pix = pixels;
+    /* unrolled loop */
+    __asm__ volatile (
+        "movq      (%3), %%mm0          \n\t"
+        "movq     8(%3), %%mm1          \n\t"
+        "movq    16(%3), %%mm2          \n\t"
+        "movq    24(%3), %%mm3          \n\t"
+        "movq    32(%3), %%mm4          \n\t"
+        "movq    40(%3), %%mm5          \n\t"
+        "movq    48(%3), %%mm6          \n\t"
+        "movq    56(%3), %%mm7          \n\t"
+        "packuswb %%mm1, %%mm0          \n\t"
+        "packuswb %%mm3, %%mm2          \n\t"
+        "packuswb %%mm5, %%mm4          \n\t"
+        "packuswb %%mm7, %%mm6          \n\t"
+        "movq     %%mm0, (%0)           \n\t"
+        "movq     %%mm2, (%0, %1)       \n\t"
+        "movq     %%mm4, (%0, %1, 2)    \n\t"
+        "movq     %%mm6, (%0, %2)       \n\t"
+        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+           "r" (p)
+        : "memory");
+    pix += line_size * 4;
+    p   += 32;
+
+    // If this were an exact copy of the code above, the
+    // compiler would generate some very strange code,
+    // hence the use of "r".
+    __asm__ volatile (
+        "movq       (%3), %%mm0         \n\t"
+        "movq      8(%3), %%mm1         \n\t"
+        "movq     16(%3), %%mm2         \n\t"
+        "movq     24(%3), %%mm3         \n\t"
+        "movq     32(%3), %%mm4         \n\t"
+        "movq     40(%3), %%mm5         \n\t"
+        "movq     48(%3), %%mm6         \n\t"
+        "movq     56(%3), %%mm7         \n\t"
+        "packuswb  %%mm1, %%mm0         \n\t"
+        "packuswb  %%mm3, %%mm2         \n\t"
+        "packuswb  %%mm5, %%mm4         \n\t"
+        "packuswb  %%mm7, %%mm6         \n\t"
+        "movq      %%mm0, (%0)          \n\t"
+        "movq      %%mm2, (%0, %1)      \n\t"
+        "movq      %%mm4, (%0, %1, 2)   \n\t"
+        "movq      %%mm6, (%0, %2)      \n\t"
+        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+           "r" (p)
+        : "memory");
+}
+
+#define put_signed_pixels_clamped_mmx_half(off)             \
+    "movq          "#off"(%2), %%mm1        \n\t"           \
+    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
+    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
+    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
+    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
+    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
+    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
+    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
+    "paddb              %%mm0, %%mm1        \n\t"           \
+    "paddb              %%mm0, %%mm2        \n\t"           \
+    "paddb              %%mm0, %%mm3        \n\t"           \
+    "paddb              %%mm0, %%mm4        \n\t"           \
+    "movq               %%mm1, (%0)         \n\t"           \
+    "movq               %%mm2, (%0, %3)     \n\t"           \
+    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
+    "movq               %%mm4, (%0, %1)     \n\t"
+
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                                      int line_size)
+{
+    x86_reg line_skip = line_size;
+    x86_reg line_skip3;
+
+    __asm__ volatile (
+        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
+        "lea         (%3, %3, 2), %1        \n\t"
+        put_signed_pixels_clamped_mmx_half(0)
+        "lea         (%0, %3, 4), %0        \n\t"
+        put_signed_pixels_clamped_mmx_half(64)
+        : "+&r" (pixels), "=&r" (line_skip3)
+        : "r" (block), "r" (line_skip)
+        : "memory");
+}
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+                               int line_size)
+{
+    const int16_t *p;
+    uint8_t *pix;
+    int i;
+
+    /* read the pixels */
+    p   = block;
+    pix = pixels;
+    MOVQ_ZERO(mm7);
+    i = 4;
+    do {
+        __asm__ volatile (
+            "movq        (%2), %%mm0    \n\t"
+            "movq       8(%2), %%mm1    \n\t"
+            "movq      16(%2), %%mm2    \n\t"
+            "movq      24(%2), %%mm3    \n\t"
+            "movq          %0, %%mm4    \n\t"
+            "movq          %1, %%mm6    \n\t"
+            "movq       %%mm4, %%mm5    \n\t"
+            "punpcklbw  %%mm7, %%mm4    \n\t"
+            "punpckhbw  %%mm7, %%mm5    \n\t"
+            "paddsw     %%mm4, %%mm0    \n\t"
+            "paddsw     %%mm5, %%mm1    \n\t"
+            "movq       %%mm6, %%mm5    \n\t"
+            "punpcklbw  %%mm7, %%mm6    \n\t"
+            "punpckhbw  %%mm7, %%mm5    \n\t"
+            "paddsw     %%mm6, %%mm2    \n\t"
+            "paddsw     %%mm5, %%mm3    \n\t"
+            "packuswb   %%mm1, %%mm0    \n\t"
+            "packuswb   %%mm3, %%mm2    \n\t"
+            "movq       %%mm0, %0       \n\t"
+            "movq       %%mm2, %1       \n\t"
+            : "+m" (*pix), "+m" (*(pix + line_size))
+            : "r" (p)
+            : "memory");
+        pix += line_size * 2;
+        p   += 16;
+    } while (--i);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index d01ff1c0f8..fa590066d6 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -229,7 +229,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     if(s->mb_intra) block[0]= level;
     else            block[0]= temp_block[0];
 
-    if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
+    if (s->idsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM) {
         if(last_non_zero_p1 <= 1) goto end;
         block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
         block[0x20] = temp_block[0x10];
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index 68ad929067..a66fc70982 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -22,7 +22,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/x86/cpu.h"
-#include "libavcodec/dsputil.h"
+#include "libavcodec/idctdsp.h"
 #include "libavcodec/proresdsp.h"
 
 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index a342110cd3..bbe5a67472 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -23,7 +23,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/mem.h"
 #include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
 
 #if HAVE_INLINE_ASM
 
-- 
cgit v1.2.3