summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorDiego Biurrun <diego@biurrun.de>2014-01-24 11:55:16 +0100
committerDiego Biurrun <diego@biurrun.de>2014-06-30 07:58:46 -0700
commite3fcb14347466095839c2a3c47ebecff02da891e (patch)
tree38fbcef2c592faae3610887dbda3ab333181d1dc /libavcodec/x86
parentadcb8392c9b185fd8a91a95fa256d15ab1432a30 (diff)
dsputil: Split off IDCT bits into their own context
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/Makefile10
-rw-r--r--libavcodec/x86/cavsdsp.c3
-rw-r--r--libavcodec/x86/dsputil_init.c85
-rw-r--r--libavcodec/x86/dsputil_mmx.c135
-rw-r--r--libavcodec/x86/dsputil_x86.h7
-rw-r--r--libavcodec/x86/idct_mmx_xvid.c2
-rw-r--r--libavcodec/x86/idct_sse2_xvid.c2
-rw-r--r--libavcodec/x86/idctdsp.h31
-rw-r--r--libavcodec/x86/idctdsp_init.c106
-rw-r--r--libavcodec/x86/idctdsp_mmx.c168
-rw-r--r--libavcodec/x86/mpegvideoenc_template.c2
-rw-r--r--libavcodec/x86/proresdsp_init.c2
-rw-r--r--libavcodec/x86/simple_idct.c2
13 files changed, 318 insertions, 237 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 13f9affdb2..14e58f9a9c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -18,6 +18,7 @@ OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o
OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o
+OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \
@@ -49,13 +50,14 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
MMX-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_mmx.o
MMX-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_mmx.o
-MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
- x86/idct_mmx_xvid.o \
- x86/idct_sse2_xvid.o \
- x86/simple_idct.o
+MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o
MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
x86/hpeldsp_mmx.o
MMX-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_mmx.o
+MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \
+ x86/idct_mmx_xvid.o \
+ x86/idct_sse2_xvid.o \
+ x86/simple_idct.o
MMX-OBJS-$(CONFIG_QPELDSP) += x86/fpel_mmx.o
MMX-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_mmx.o
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index d5c441f1e5..f0e8cfcd17 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -28,9 +28,10 @@
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
+#include "libavcodec/idctdsp.h"
#include "constants.h"
-#include "dsputil_x86.h"
#include "fpel.h"
+#include "idctdsp.h"
#include "config.h"
#if HAVE_MMX_INLINE
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 74dab48e72..adc7aa95d6 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -22,97 +22,18 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
-#include "libavcodec/simple_idct.h"
#include "dsputil_x86.h"
-#include "idct_xvid.h"
-
-/* Input permutation for the simple_idct_mmx */
-static const uint8_t simple_mmx_permutation[64] = {
- 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
- 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
- 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
- 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
- 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
- 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
- 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
- 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
-};
-
-static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
- int idct_permutation_type)
-{
- int i;
-
- switch (idct_permutation_type) {
- case FF_SIMPLE_IDCT_PERM:
- for (i = 0; i < 64; i++)
- idct_permutation[i] = simple_mmx_permutation[i];
- return 1;
- case FF_SSE2_IDCT_PERM:
- for (i = 0; i < 64; i++)
- idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
- return 1;
- }
-
- return 0;
-}
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_MMX_INLINE
- c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
-
if (!high_bit_depth) {
c->draw_edges = ff_draw_edges_mmx;
-
- switch (avctx->idct_algo) {
- case FF_IDCT_AUTO:
- case FF_IDCT_SIMPLEMMX:
- c->idct_put = ff_simple_idct_put_mmx;
- c->idct_add = ff_simple_idct_add_mmx;
- c->idct = ff_simple_idct_mmx;
- c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
- break;
- case FF_IDCT_XVIDMMX:
- c->idct_put = ff_idct_xvid_mmx_put;
- c->idct_add = ff_idct_xvid_mmx_add;
- c->idct = ff_idct_xvid_mmx;
- break;
- }
}
#endif /* HAVE_MMX_INLINE */
}
-static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
- int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_MMXEXT_INLINE
- if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
- c->idct_put = ff_idct_xvid_mmxext_put;
- c->idct_add = ff_idct_xvid_mmxext_add;
- c->idct = ff_idct_xvid_mmxext;
- }
-#endif /* HAVE_MMXEXT_INLINE */
-}
-
-static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
- int cpu_flags, unsigned high_bit_depth)
-{
-#if HAVE_SSE2_INLINE
- if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
- c->idct_put = ff_idct_xvid_sse2_put;
- c->idct_add = ff_idct_xvid_sse2_add;
- c->idct = ff_idct_xvid_sse2;
- c->idct_permutation_type = FF_SSE2_IDCT_PERM;
- }
-#endif /* HAVE_SSE2_INLINE */
-}
-
av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
@@ -121,12 +42,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
if (X86_MMX(cpu_flags))
dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);
- if (X86_MMXEXT(cpu_flags))
- dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);
-
- if (X86_SSE2(cpu_flags))
- dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);
-
if (CONFIG_ENCODERS)
ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5fa047da7b..d205a48ea4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -30,141 +30,6 @@
#if HAVE_INLINE_ASM
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- const int16_t *p;
- uint8_t *pix;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- /* unrolled loop */
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
- pix += line_size * 4;
- p += 32;
-
- // if here would be an exact copy of the code above
- // compiler would generate some very strange code
- // thus using "r"
- __asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "packuswb %%mm7, %%mm6 \n\t"
- "movq %%mm0, (%0) \n\t"
- "movq %%mm2, (%0, %1) \n\t"
- "movq %%mm4, (%0, %1, 2) \n\t"
- "movq %%mm6, (%0, %2) \n\t"
- :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
- "r" (p)
- : "memory");
-}
-
-#define put_signed_pixels_clamped_mmx_half(off) \
- "movq "#off"(%2), %%mm1 \n\t" \
- "movq 16 + "#off"(%2), %%mm2 \n\t" \
- "movq 32 + "#off"(%2), %%mm3 \n\t" \
- "movq 48 + "#off"(%2), %%mm4 \n\t" \
- "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
- "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
- "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
- "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
- "paddb %%mm0, %%mm1 \n\t" \
- "paddb %%mm0, %%mm2 \n\t" \
- "paddb %%mm0, %%mm3 \n\t" \
- "paddb %%mm0, %%mm4 \n\t" \
- "movq %%mm1, (%0) \n\t" \
- "movq %%mm2, (%0, %3) \n\t" \
- "movq %%mm3, (%0, %3, 2) \n\t" \
- "movq %%mm4, (%0, %1) \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- x86_reg line_skip = line_size;
- x86_reg line_skip3;
-
- __asm__ volatile (
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
- "lea (%3, %3, 2), %1 \n\t"
- put_signed_pixels_clamped_mmx_half(0)
- "lea (%0, %3, 4), %0 \n\t"
- put_signed_pixels_clamped_mmx_half(64)
- : "+&r" (pixels), "=&r" (line_skip3)
- : "r" (block), "r" (line_skip)
- : "memory");
-}
-
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- const int16_t *p;
- uint8_t *pix;
- int i;
-
- /* read the pixels */
- p = block;
- pix = pixels;
- MOVQ_ZERO(mm7);
- i = 4;
- do {
- __asm__ volatile (
- "movq (%2), %%mm0 \n\t"
- "movq 8(%2), %%mm1 \n\t"
- "movq 16(%2), %%mm2 \n\t"
- "movq 24(%2), %%mm3 \n\t"
- "movq %0, %%mm4 \n\t"
- "movq %1, %%mm6 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm4, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "movq %%mm6, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddsw %%mm6, %%mm2 \n\t"
- "paddsw %%mm5, %%mm3 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "packuswb %%mm3, %%mm2 \n\t"
- "movq %%mm0, %0 \n\t"
- "movq %%mm2, %1 \n\t"
- : "+m" (*pix), "+m" (*(pix + line_size))
- : "r" (p)
- : "memory");
- pix += line_size * 2;
- p += 16;
- } while (--i);
-}
-
/* Draw the edges of width 'w' of an image of size width, height
* this MMX version can only handle w == 8 || w == 16. */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 4beb6c11ca..7e1e8af051 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -31,13 +31,6 @@ void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx);
-void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size);
-void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size);
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size);
-
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides);
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 27723393bf..920ea4c0dc 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -44,8 +44,8 @@
#include "config.h"
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"
-#include "dsputil_x86.h"
#include "idct_xvid.h"
+#include "idctdsp.h"
#if HAVE_MMX_INLINE
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 50655d6bc0..aadeb122c6 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -42,7 +42,7 @@
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "idct_xvid.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
#if HAVE_SSE2_INLINE
diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h
new file mode 100644
index 0000000000..22df3dd758
--- /dev/null
+++ b/libavcodec/x86/idctdsp.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_IDCTDSP_H
+#define AVCODEC_X86_IDCTDSP_H
+
+#include <stdint.h>
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size);
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size);
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size);
+
+#endif /* AVCODEC_X86_IDCTDSP_H */
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
new file mode 100644
index 0000000000..9b68497502
--- /dev/null
+++ b/libavcodec/x86/idctdsp_init.c
@@ -0,0 +1,106 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/simple_idct.h"
+#include "idct_xvid.h"
+#include "idctdsp.h"
+
+/* Input permutation for the simple_idct_mmx */
+static const uint8_t simple_mmx_permutation[64] = {
+ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
+ 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
+ 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
+ 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
+ 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
+ 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
+ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
+ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
+};
+
+static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation,
+ int idct_permutation_type)
+{
+ int i;
+
+ switch (idct_permutation_type) {
+ case FF_SIMPLE_IDCT_PERM:
+ for (i = 0; i < 64; i++)
+ idct_permutation[i] = simple_mmx_permutation[i];
+ return 1;
+ case FF_SSE2_IDCT_PERM:
+ for (i = 0; i < 64; i++)
+ idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
+ return 1;
+ }
+
+ return 0;
+}
+
+av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(cpu_flags)) {
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
+
+ if (!high_bit_depth) {
+ switch (avctx->idct_algo) {
+ case FF_IDCT_AUTO:
+ case FF_IDCT_SIMPLEMMX:
+ c->idct_put = ff_simple_idct_put_mmx;
+ c->idct_add = ff_simple_idct_add_mmx;
+ c->idct = ff_simple_idct_mmx;
+ c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+ break;
+ case FF_IDCT_XVIDMMX:
+ c->idct_put = ff_idct_xvid_mmx_put;
+ c->idct_add = ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ break;
+ }
+ }
+ }
+
+ if (INLINE_MMXEXT(cpu_flags)) {
+ if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+ c->idct_put = ff_idct_xvid_mmxext_put;
+ c->idct_add = ff_idct_xvid_mmxext_add;
+ c->idct = ff_idct_xvid_mmxext;
+ }
+ }
+
+ if (INLINE_SSE2(cpu_flags)) {
+ if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
+ c->idct_put = ff_idct_xvid_sse2_put;
+ c->idct_add = ff_idct_xvid_sse2_add;
+ c->idct = ff_idct_xvid_sse2;
+ c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+ }
+ }
+}
diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c
new file mode 100644
index 0000000000..7285b1d357
--- /dev/null
+++ b/libavcodec/x86/idctdsp_mmx.c
@@ -0,0 +1,168 @@
+/*
+ * SIMD-optimized IDCT-related routines
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "idctdsp.h"
+#include "inline_asm.h"
+
+#if HAVE_INLINE_ASM
+
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ const int16_t *p;
+ uint8_t *pix;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ /* unrolled loop */
+ __asm__ volatile (
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+ "r" (p)
+ : "memory");
+ pix += line_size * 4;
+ p += 32;
+
+ // if here would be an exact copy of the code above
+ // compiler would generate some very strange code
+ // thus using "r"
+ __asm__ volatile (
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
+ "r" (p)
+ : "memory");
+}
+
+#define put_signed_pixels_clamped_mmx_half(off) \
+ "movq "#off"(%2), %%mm1 \n\t" \
+ "movq 16 + "#off"(%2), %%mm2 \n\t" \
+ "movq 32 + "#off"(%2), %%mm3 \n\t" \
+ "movq 48 + "#off"(%2), %%mm4 \n\t" \
+ "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
+ "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
+ "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
+ "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
+ "paddb %%mm0, %%mm1 \n\t" \
+ "paddb %%mm0, %%mm2 \n\t" \
+ "paddb %%mm0, %%mm3 \n\t" \
+ "paddb %%mm0, %%mm4 \n\t" \
+ "movq %%mm1, (%0) \n\t" \
+ "movq %%mm2, (%0, %3) \n\t" \
+ "movq %%mm3, (%0, %3, 2) \n\t" \
+ "movq %%mm4, (%0, %1) \n\t"
+
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ x86_reg line_skip = line_size;
+ x86_reg line_skip3;
+
+ __asm__ volatile (
+ "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
+ "lea (%3, %3, 2), %1 \n\t"
+ put_signed_pixels_clamped_mmx_half(0)
+ "lea (%0, %3, 4), %0 \n\t"
+ put_signed_pixels_clamped_mmx_half(64)
+ : "+&r" (pixels), "=&r" (line_skip3)
+ : "r" (block), "r" (line_skip)
+ : "memory");
+}
+
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
+ int line_size)
+{
+ const int16_t *p;
+ uint8_t *pix;
+ int i;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ MOVQ_ZERO(mm7);
+ i = 4;
+ do {
+ __asm__ volatile (
+ "movq (%2), %%mm0 \n\t"
+ "movq 8(%2), %%mm1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm6 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm4, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm6, %%mm2 \n\t"
+ "paddsw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %0 \n\t"
+ "movq %%mm2, %1 \n\t"
+ : "+m" (*pix), "+m" (*(pix + line_size))
+ : "r" (p)
+ : "memory");
+ pix += line_size * 2;
+ p += 16;
+ } while (--i);
+}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index d01ff1c0f8..fa590066d6 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -229,7 +229,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];
- if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
+ if (s->idsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM) {
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c
index 68ad929067..a66fc70982 100644
--- a/libavcodec/x86/proresdsp_init.c
+++ b/libavcodec/x86/proresdsp_init.c
@@ -22,7 +22,7 @@
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
-#include "libavcodec/dsputil.h"
+#include "libavcodec/idctdsp.h"
#include "libavcodec/proresdsp.h"
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c
index a342110cd3..bbe5a67472 100644
--- a/libavcodec/x86/simple_idct.c
+++ b/libavcodec/x86/simple_idct.c
@@ -23,7 +23,7 @@
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
-#include "dsputil_x86.h"
+#include "idctdsp.h"
#if HAVE_INLINE_ASM