commit a6493a8fbd979eb96898d910b8a64df7c5eee6a5 (patch)
tree 6a73c038bcc48265a8ddc4d08635b65fc8e8e706 /libavcodec/x86
parent 50e3477f0f2aabcc4a192af39c4a4f87da66bb85 (diff)
author Diego Biurrun <diego@biurrun.de> 2008-12-22 09:12:42 +0000
committer Diego Biurrun <diego@biurrun.de> 2008-12-22 09:12:42 +0000

Rename libavcodec/i386/ --> libavcodec/x86/.

It contains optimizations that are not specific to i386 and libavutil
uses this naming scheme already.

Originally committed as revision 16270 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/cavsdsp_mmx.c                   497
-rw-r--r--  libavcodec/x86/cpuid.c                         134
-rw-r--r--  libavcodec/x86/dnxhd_mmx.c                      58
-rw-r--r--  libavcodec/x86/dsputil_h264_template_mmx.c     308
-rw-r--r--  libavcodec/x86/dsputil_h264_template_ssse3.c   208
-rw-r--r--  libavcodec/x86/dsputil_mmx.c                  2976
-rw-r--r--  libavcodec/x86/dsputil_mmx.h                   154
-rw-r--r--  libavcodec/x86/dsputil_mmx_avg_template.c      896
-rw-r--r--  libavcodec/x86/dsputil_mmx_qns_template.c      101
-rw-r--r--  libavcodec/x86/dsputil_mmx_rnd_template.c      590
-rw-r--r--  libavcodec/x86/dsputil_yasm.asm                 92
-rw-r--r--  libavcodec/x86/dsputilenc_mmx.c               1441
-rw-r--r--  libavcodec/x86/fdct_mmx.c                      580
-rw-r--r--  libavcodec/x86/fft_3dn.c                        23
-rw-r--r--  libavcodec/x86/fft_3dn2.c                      173
-rw-r--r--  libavcodec/x86/fft_mmx.asm                     467
-rw-r--r--  libavcodec/x86/fft_sse.c                       202
-rw-r--r--  libavcodec/x86/flacdsp_mmx.c                   139
-rw-r--r--  libavcodec/x86/h264_deblock_sse2.asm           747
-rw-r--r--  libavcodec/x86/h264_i386.h                     155
-rw-r--r--  libavcodec/x86/h264dsp_mmx.c                  2208
-rw-r--r--  libavcodec/x86/idct_mmx.c                      605
-rw-r--r--  libavcodec/x86/idct_mmx_xvid.c                 525
-rw-r--r--  libavcodec/x86/idct_sse2_xvid.c                394
-rw-r--r--  libavcodec/x86/idct_xvid.h                      37
-rw-r--r--  libavcodec/x86/mathops.h                        43
-rw-r--r--  libavcodec/x86/mmx.h                           267
-rw-r--r--  libavcodec/x86/motion_est_mmx.c                461
-rw-r--r--  libavcodec/x86/mpegvideo_mmx.c                 654
-rw-r--r--  libavcodec/x86/mpegvideo_mmx_template.c        376
-rw-r--r--  libavcodec/x86/simple_idct_mmx.c              1294
-rw-r--r--  libavcodec/x86/snowdsp_mmx.c                   871
-rw-r--r--  libavcodec/x86/vc1dsp_mmx.c                    490
-rw-r--r--  libavcodec/x86/vp3dsp_mmx.c                    396
-rw-r--r--  libavcodec/x86/vp3dsp_mmx.h                     35
-rw-r--r--  libavcodec/x86/vp3dsp_sse2.c                   186
-rw-r--r--  libavcodec/x86/vp3dsp_sse2.h                    31
-rw-r--r--  libavcodec/x86/x86inc.asm                      540
38 files changed, 19354 insertions, 0 deletions
diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c
new file mode 100644
index 0000000000..2000ba524a
--- /dev/null
+++ b/libavcodec/x86/cavsdsp_mmx.c
@@ -0,0 +1,497 @@
+/*
+ * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
+ * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
+ *
+ * MMX-optimized DSP functions, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_mmx.h"
+
+/*****************************************************************************
+ *
+ * inverse transform
+ *
+ ****************************************************************************/
+
+static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
+{
+ __asm__ volatile(
+ "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
+ "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
+ "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
+ "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
+ "movq %%mm4, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm2, %%mm6 \n\t"
+ "movq %%mm7, %%mm1 \n\t"
+
+ "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
+ "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
+ "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
+ "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
+ "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
+ "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
+ "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
+ "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
+ "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
+
+ "movq %%mm5, %%mm4 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "movq %%mm3, %%mm0 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
+ "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
+ "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
+ "paddw %%mm7, %%mm7 \n\t"
+ "paddw %%mm5, %%mm5 \n\t"
+ "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
+ "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
+
+ SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
+ "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
+ "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
+ "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
+ "paddw %%mm1, %%mm1 \n\t"
+ "paddw %%mm3, %%mm3 \n\t"
+ "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
+ "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
+
+ "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
+ "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
+ "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
+ "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
+ "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm0 \n\t"
+ "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
+ "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
+
+ "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
+ "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
+ SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
+ "psllw $3, %%mm0 \n\t"
+ "psllw $3, %%mm2 \n\t"
+ "paddw %1, %%mm0 \n\t" /* add rounding bias */
+ "paddw %1, %%mm2 \n\t" /* add rounding bias */
+
+ SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
+ SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
+ SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
+ SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
+ SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
+ SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
+ :: "r"(block), "m"(bias)
+ );
+}
+
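
For reference, the 1-D pass that the asm above implements can be written in
scalar C. The following sketch mirrors the register comments (it is not part
of the patch); src[k] stands for the k-th row/column coefficient, i.e.
block[8*k] at this point in the code:

/* Scalar sketch of one 1-D pass of the CAVS 8x8 inverse transform,
 * following the register comments above (illustrative only). */
static void cavs_idct8_1d_ref(int src[8], int bias)
{
    const int a0 = 3*src[1] - 2*src[7];
    const int a1 = 3*src[3] + 2*src[5];
    const int a2 = 2*src[3] - 3*src[5];
    const int a3 = 2*src[1] + 3*src[7];

    const int b4 = 2*(a0 + a1 + a3) + a1;
    const int b5 = 2*(a0 - a1 + a2) + a0;
    const int b6 = 2*(a3 - a2 - a1) + a3;
    const int b7 = 2*(a0 - a2 - a3) - a2;

    const int a4 = ((src[0] + src[4]) << 3) + bias;
    const int a5 = ((src[0] - src[4]) << 3) + bias;
    const int a6 = 10*src[2] +  4*src[6];
    const int a7 =  4*src[2] - 10*src[6];

    src[0] = a4 + a6 + b4;  src[7] = a4 + a6 - b4;
    src[1] = a5 + a7 + b5;  src[6] = a5 + a7 - b5;
    src[2] = a5 - a7 + b6;  src[5] = a5 - a7 - b6;
    src[3] = a4 - a6 + b7;  src[4] = a4 - a6 - b7;
}

The caller then shifts the results right by 3 (row pass) or 7 (column pass),
which is what the psraw blocks in cavs_idct8_add_mmx below do.
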
+static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ int i;
+ DECLARE_ALIGNED_8(int16_t, b2[64]);
+
+ for(i=0; i<2; i++){
+ DECLARE_ALIGNED_8(uint64_t, tmp);
+
+ cavs_idct8_1d(block+4*i, ff_pw_4);
+
+ __asm__ volatile(
+ "psraw $3, %%mm7 \n\t"
+ "psraw $3, %%mm6 \n\t"
+ "psraw $3, %%mm5 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm2 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm0 \n\t"
+ "movq %%mm7, %0 \n\t"
+ TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+ "movq %%mm0, 8(%1) \n\t"
+ "movq %%mm6, 24(%1) \n\t"
+ "movq %%mm7, 40(%1) \n\t"
+ "movq %%mm4, 56(%1) \n\t"
+ "movq %0, %%mm7 \n\t"
+ TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+ "movq %%mm7, (%1) \n\t"
+ "movq %%mm1, 16(%1) \n\t"
+ "movq %%mm0, 32(%1) \n\t"
+ "movq %%mm3, 48(%1) \n\t"
+ : "=m"(tmp)
+ : "r"(b2+32*i)
+ : "memory"
+ );
+ }
+
+ for(i=0; i<2; i++){
+ cavs_idct8_1d(b2+4*i, ff_pw_64);
+
+ __asm__ volatile(
+ "psraw $7, %%mm7 \n\t"
+ "psraw $7, %%mm6 \n\t"
+ "psraw $7, %%mm5 \n\t"
+ "psraw $7, %%mm4 \n\t"
+ "psraw $7, %%mm3 \n\t"
+ "psraw $7, %%mm2 \n\t"
+ "psraw $7, %%mm1 \n\t"
+ "psraw $7, %%mm0 \n\t"
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm5, 16(%0) \n\t"
+ "movq %%mm3, 32(%0) \n\t"
+ "movq %%mm1, 48(%0) \n\t"
+ "movq %%mm0, 64(%0) \n\t"
+ "movq %%mm2, 80(%0) \n\t"
+ "movq %%mm4, 96(%0) \n\t"
+ "movq %%mm6, 112(%0) \n\t"
+ :: "r"(b2+4*i)
+ : "memory"
+ );
+ }
+
+ add_pixels_clamped_mmx(b2, dst, stride);
+
+ /* clear block */
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm7, 8(%0) \n\t"
+ "movq %%mm7, 16(%0) \n\t"
+ "movq %%mm7, 24(%0) \n\t"
+ "movq %%mm7, 32(%0) \n\t"
+ "movq %%mm7, 40(%0) \n\t"
+ "movq %%mm7, 48(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ "movq %%mm7, 64(%0) \n\t"
+ "movq %%mm7, 72(%0) \n\t"
+ "movq %%mm7, 80(%0) \n\t"
+ "movq %%mm7, 88(%0) \n\t"
+ "movq %%mm7, 96(%0) \n\t"
+ "movq %%mm7, 104(%0) \n\t"
+ "movq %%mm7, 112(%0) \n\t"
+ "movq %%mm7, 120(%0) \n\t"
+ :: "r" (block)
+ );
+}
+
+/*****************************************************************************
+ *
+ * motion compensation
+ *
+ ****************************************************************************/
+
+/* vertical filter [-1 -2 96 42 -7 0] */
+#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw %6, %%mm7 \n\t"\
+ "psllw $3, "#E" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $3, "#E" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#E", %%mm6 \n\t"\
+ "paddw "#B", "#B" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $1, "#B" \n\t"\
+ "psubw "#A", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -1 5 5 -1 0] */
+#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "paddw "#D", %%mm6 \n\t"\
+ "pmullw %5, %%mm6 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $3, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+/* vertical filter [ 0 -7 42 96 -2 -1] */
+#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \
+ "movd (%0), "#F" \n\t"\
+ "movq "#C", %%mm6 \n\t"\
+ "pmullw %6, %%mm6 \n\t"\
+ "movq "#D", %%mm7 \n\t"\
+ "pmullw %5, %%mm7 \n\t"\
+ "psllw $3, "#B" \n\t"\
+ "psubw "#B", %%mm6 \n\t"\
+ "psraw $3, "#B" \n\t"\
+ "paddw %%mm7, %%mm6 \n\t"\
+ "paddw "#B", %%mm6 \n\t"\
+ "paddw "#E", "#E" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, "#F" \n\t"\
+ "psubw "#E", %%mm6 \n\t"\
+ "psraw $1, "#E" \n\t"\
+ "psubw "#F", %%mm6 \n\t"\
+ "paddw %4, %%mm6 \n\t"\
+ "psraw $7, %%mm6 \n\t"\
+ "packuswb %%mm6, %%mm6 \n\t"\
+ OP(%%mm6, (%1), A, d) \
+ "add %3, %1 \n\t"
+
+
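In scalar terms, the three vertical filter variants above compute the
following, with A..F standing for six consecutive rows of one source column
and av_clip_uint8 standing in for the packuswb saturation (a sketch, not part
of the patch):

/* Scalar sketches of the three CAVS vertical quarter-pel filters. */
static int cavs_v1(int A, int B, int C, int D, int E, int F)
{ return av_clip_uint8((-A - 2*B + 96*C + 42*D - 7*E     + 64) >> 7); }
static int cavs_v2(int A, int B, int C, int D, int E, int F)
{ return av_clip_uint8((    -B +  5*C +  5*D -   E       +  4) >> 3); }
static int cavs_v3(int A, int B, int C, int D, int E, int F)
{ return av_clip_uint8((  -7*B + 42*C + 96*D - 2*E -   F + 64) >> 7); }
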
+#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }
+
+#define QPEL_CAVS(OPNAME, OP, MMX)\
+static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm6 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm1 \n\t"\
+ "psraw $3, %%mm0 \n\t"\
+ "psraw $3, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q) \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+m"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
+ : "memory"\
+ );\
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
+}\
+\
+static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
+}\
+\
+static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
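The horizontal kernel in OPNAME##cavs_qpel8_h above is the half-pel case; per
pixel it amounts to the following scalar sketch (again with av_clip_uint8
standing in for packuswb; not part of the patch):

/* Horizontal filter: taps [-1 5 5 -1], rounding 4 (ff_pw_4), shift 3. */
static int cavs_h(const uint8_t *s)
{
    return av_clip_uint8((5*(s[0] + s[1]) - (s[-1] + s[2]) + 4) >> 3);
}
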
+#define CAVS_MC(OPNAME, SIZE, MMX) \
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
+}\
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+QPEL_CAVS(put_, PUT_OP, 3dnow)
+QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
+QPEL_CAVS(put_, PUT_OP, mmx2)
+QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2)
+
+CAVS_MC(put_, 8, 3dnow)
+CAVS_MC(put_, 16,3dnow)
+CAVS_MC(avg_, 8, 3dnow)
+CAVS_MC(avg_, 16,3dnow)
+CAVS_MC(put_, 8, mmx2)
+CAVS_MC(put_, 16,mmx2)
+CAVS_MC(avg_, 8, mmx2)
+CAVS_MC(avg_, 16,mmx2)
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
+
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) {
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+}
+
+void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) {
+#define dspfunc(PFX, IDX, NUM) \
+ c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
+ c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
+ c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
+ c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \
+
+ dspfunc(put_cavs_qpel, 0, 16);
+ dspfunc(put_cavs_qpel, 1, 8);
+ dspfunc(avg_cavs_qpel, 0, 16);
+ dspfunc(avg_cavs_qpel, 1, 8);
+#undef dspfunc
+ c->cavs_idct8_add = cavs_idct8_add_mmx;
+}
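
The dspfunc macro fills the quarter-pel function tables; the table index
encodes the subpixel position as x + 4*y, which is why mc20 lands in slot 2
and mc01/mc02/mc03 in slots 4/8/12. A hypothetical caller would dispatch like
this (illustrative only, not part of the patch):

/* Select the 16x16 put function for quarter-pel offset (x, y). */
static void mc16(DSPContext *c, uint8_t *dst, uint8_t *src, int stride,
                 int x, int y)
{
    c->put_cavs_qpel_pixels_tab[0][x + 4*y](dst, src, stride);
}
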
diff --git a/libavcodec/x86/cpuid.c b/libavcodec/x86/cpuid.c
new file mode 100644
index 0000000000..cd33334ecf
--- /dev/null
+++ b/libavcodec/x86/cpuid.c
@@ -0,0 +1,134 @@
+/*
+ * CPU detection code, extracted from mmx.h
+ * (c)1997-99 by H. Dietz and R. Fisher
+ * Converted to C and improved by Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdlib.h>
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+#undef printf
+
+/* Saving ebx is necessary for PIC; gcc seems unable to manage it on its own. */
+#define cpuid(index,eax,ebx,ecx,edx)\
+ __asm__ volatile\
+ ("mov %%"REG_b", %%"REG_S"\n\t"\
+ "cpuid\n\t"\
+ "xchg %%"REG_b", %%"REG_S\
+ : "=a" (eax), "=S" (ebx),\
+ "=c" (ecx), "=d" (edx)\
+ : "0" (index));
+
+/* Function to test if multimedia instructions are supported... */
+int mm_support(void)
+{
+ int rval = 0;
+ int eax, ebx, ecx, edx;
+ int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+ x86_reg a, c;
+
+#ifdef ARCH_X86_64
+#define PUSHF "pushfq\n\t"
+#define POPF "popfq\n\t"
+#else
+#define PUSHF "pushfl\n\t"
+#define POPF "popfl\n\t"
+#endif
+ __asm__ volatile (
+ /* See if CPUID instruction is supported ... */
+ /* ... Get copies of EFLAGS into eax and ecx */
+ PUSHF
+ "pop %0\n\t"
+ "mov %0, %1\n\t"
+
+ /* ... Toggle the ID bit in one copy and store */
+ /* to the EFLAGS reg */
+ "xor $0x200000, %0\n\t"
+ "push %0\n\t"
+ POPF
+
+ /* ... Get the (hopefully modified) EFLAGS */
+ PUSHF
+ "pop %0\n\t"
+ : "=a" (a), "=c" (c)
+ :
+ : "cc"
+ );
+
+ if (a == c)
+ return 0; /* CPUID not supported */
+
+ cpuid(0, max_std_level, ebx, ecx, edx);
+
+ if(max_std_level >= 1){
+ cpuid(1, eax, ebx, ecx, std_caps);
+ if (std_caps & (1<<23))
+ rval |= FF_MM_MMX;
+ if (std_caps & (1<<25))
+ rval |= FF_MM_MMXEXT
+#if !defined(__GNUC__) || __GNUC__ > 2
+ | FF_MM_SSE;
+ if (std_caps & (1<<26))
+ rval |= FF_MM_SSE2;
+ if (ecx & 1)
+ rval |= FF_MM_SSE3;
+ if (ecx & 0x00000200 )
+ rval |= FF_MM_SSSE3
+#endif
+ ;
+ }
+
+ cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
+
+ if(max_ext_level >= 0x80000001){
+ cpuid(0x80000001, eax, ebx, ecx, ext_caps);
+ if (ext_caps & (1<<31))
+ rval |= FF_MM_3DNOW;
+ if (ext_caps & (1<<30))
+ rval |= FF_MM_3DNOWEXT;
+ if (ext_caps & (1<<23))
+ rval |= FF_MM_MMX;
+ if (ext_caps & (1<<22))
+ rval |= FF_MM_MMXEXT;
+ }
+
+#if 0
+ av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n",
+ (rval&FF_MM_MMX) ? "MMX ":"",
+ (rval&FF_MM_MMXEXT) ? "MMX2 ":"",
+ (rval&FF_MM_SSE) ? "SSE ":"",
+ (rval&FF_MM_SSE2) ? "SSE2 ":"",
+ (rval&FF_MM_SSE3) ? "SSE3 ":"",
+ (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
+ (rval&FF_MM_3DNOW) ? "3DNow ":"",
+ (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
+#endif
+ return rval;
+}
+
+#ifdef TEST
+int main ( void )
+{
+ int mm_flags;
+ mm_flags = mm_support();
+ printf("mm_support = 0x%08X\n",mm_flags);
+ return 0;
+}
+#endif
diff --git a/libavcodec/x86/dnxhd_mmx.c b/libavcodec/x86/dnxhd_mmx.c
new file mode 100644
index 0000000000..59bcb3929b
--- /dev/null
+++ b/libavcodec/x86/dnxhd_mmx.c
@@ -0,0 +1,58 @@
+/*
+ * VC3/DNxHD SIMD functions
+ * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
+ *
+ * VC-3 encoder funded by the British Broadcasting Corporation
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dnxhdenc.h"
+
+static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
+{
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "movq (%0, %2), %%xmm2 \n\t"
+ "movq (%0, %2,2), %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "movdqa %%xmm0, (%1) \n\t"
+ "movdqa %%xmm1, 16(%1) \n\t"
+ "movdqa %%xmm2, 32(%1) \n\t"
+ "movdqa %%xmm3, 48(%1) \n\t"
+ "movdqa %%xmm3 , 64(%1) \n\t"
+ "movdqa %%xmm2 , 80(%1) \n\t"
+ "movdqa %%xmm1 , 96(%1) \n\t"
+ "movdqa %%xmm0, 112(%1) \n\t"
+ : "+r" (pixels)
+ : "r" (block), "r" ((x86_reg)line_size)
+ );
+}
+
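In scalar terms, get_pixels_8x4_sym_sse2 expands four 8-pixel rows to 16-bit
coefficients and mirrors them into the bottom half of the block (a sketch,
not part of the patch):

/* Rows 0-3 come from the source; rows 7..4 repeat rows 0..3. */
static void get_pixels_8x4_sym_ref(DCTELEM *block, const uint8_t *pixels,
                                   int line_size)
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++) {
            block[ i     *8 + j] = pixels[i*line_size + j];
            block[(7 - i)*8 + j] = pixels[i*line_size + j];
        }
}
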
+void ff_dnxhd_init_mmx(DNXHDEncContext *ctx)
+{
+ if (mm_flags & FF_MM_SSE2) {
+ ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
+ }
+}
diff --git a/libavcodec/x86/dsputil_h264_template_mmx.c b/libavcodec/x86/dsputil_h264_template_mmx.c
new file mode 100644
index 0000000000..0bf8732e35
--- /dev/null
+++ b/libavcodec/x86/dsputil_h264_template_mmx.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
+ * Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * MMX optimized version of (put|avg)_h264_chroma_mc8.
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
+ */
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+{
+ const uint64_t *rnd_reg;
+ DECLARE_ALIGNED_8(uint64_t, AA);
+ DECLARE_ALIGNED_8(uint64_t, DD);
+ int i;
+
+ if(y==0 && x==0) {
+ /* no filter needed */
+ H264_CHROMA_MC8_MV0(dst, src, stride, h);
+ return;
+ }
+
+ assert(x<8 && y<8 && x>=0 && y>=0);
+
+ if(y==0 || x==0)
+ {
+ /* 1 dimensional filter only */
+ const int dxy = x ? 1 : stride;
+
+ rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
+
+ __asm__ volatile(
+ "movd %0, %%mm5\n\t"
+ "movq %1, %%mm4\n\t"
+ "movq %2, %%mm6\n\t" /* mm6 = rnd */
+ "punpcklwd %%mm5, %%mm5\n\t"
+ "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
+ "pxor %%mm7, %%mm7\n\t"
+ "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
+ :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+
+ for(i=0; i<h; i++) {
+ __asm__ volatile(
+ /* mm0 = src[0..7], mm1 = src[1..8] */
+ "movq %0, %%mm0\n\t"
+ "movq %1, %%mm2\n\t"
+ :: "m"(src[0]), "m"(src[dxy]));
+
+ __asm__ volatile(
+ /* [mm0,mm1] = A * src[0..7] */
+ /* [mm2,mm3] = B * src[1..8] */
+ "movq %%mm0, %%mm1\n\t"
+ "movq %%mm2, %%mm3\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "punpckhbw %%mm7, %%mm3\n\t"
+ "pmullw %%mm4, %%mm0\n\t"
+ "pmullw %%mm4, %%mm1\n\t"
+ "pmullw %%mm5, %%mm2\n\t"
+ "pmullw %%mm5, %%mm3\n\t"
+
+ /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+ "paddw %%mm6, %%mm0\n\t"
+ "paddw %%mm6, %%mm1\n\t"
+ "paddw %%mm2, %%mm0\n\t"
+ "paddw %%mm3, %%mm1\n\t"
+ "psrlw $3, %%mm0\n\t"
+ "psrlw $3, %%mm1\n\t"
+ "packuswb %%mm1, %%mm0\n\t"
+ H264_CHROMA_OP(%0, %%mm0)
+ "movq %%mm0, %0\n\t"
+ : "=m" (dst[0]));
+
+ src += stride;
+ dst += stride;
+ }
+ return;
+ }
+
+ /* general case, bilinear */
+ rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
+ __asm__ volatile("movd %2, %%mm4\n\t"
+ "movd %3, %%mm6\n\t"
+ "punpcklwd %%mm4, %%mm4\n\t"
+ "punpcklwd %%mm6, %%mm6\n\t"
+ "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
+ "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
+ "movq %%mm4, %%mm5\n\t"
+ "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
+ "psllw $3, %%mm5\n\t"
+ "psllw $3, %%mm6\n\t"
+ "movq %%mm5, %%mm7\n\t"
+ "paddw %%mm6, %%mm7\n\t"
+ "movq %%mm4, %1\n\t" /* DD = x * y */
+ "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
+ "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
+ "paddw %4, %%mm4\n\t"
+ "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
+ "pxor %%mm7, %%mm7\n\t"
+ "movq %%mm4, %0\n\t"
+ : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
+
+ __asm__ volatile(
+ /* mm0 = src[0..7], mm1 = src[1..8] */
+ "movq %0, %%mm0\n\t"
+ "movq %1, %%mm1\n\t"
+ : : "m" (src[0]), "m" (src[1]));
+
+ for(i=0; i<h; i++) {
+ src += stride;
+
+ __asm__ volatile(
+ /* mm2 = A * src[0..3] + B * src[1..4] */
+ /* mm3 = A * src[4..7] + B * src[5..8] */
+ "movq %%mm0, %%mm2\n\t"
+ "movq %%mm1, %%mm3\n\t"
+ "punpckhbw %%mm7, %%mm0\n\t"
+ "punpcklbw %%mm7, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "punpckhbw %%mm7, %%mm3\n\t"
+ "pmullw %0, %%mm0\n\t"
+ "pmullw %0, %%mm2\n\t"
+ "pmullw %%mm5, %%mm1\n\t"
+ "pmullw %%mm5, %%mm3\n\t"
+ "paddw %%mm1, %%mm2\n\t"
+ "paddw %%mm0, %%mm3\n\t"
+ : : "m" (AA));
+
+ __asm__ volatile(
+ /* [mm2,mm3] += C * src[0..7] */
+ "movq %0, %%mm0\n\t"
+ "movq %%mm0, %%mm1\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm1\n\t"
+ "pmullw %%mm6, %%mm0\n\t"
+ "pmullw %%mm6, %%mm1\n\t"
+ "paddw %%mm0, %%mm2\n\t"
+ "paddw %%mm1, %%mm3\n\t"
+ : : "m" (src[0]));
+
+ __asm__ volatile(
+ /* [mm2,mm3] += D * src[1..8] */
+ "movq %1, %%mm1\n\t"
+ "movq %%mm1, %%mm0\n\t"
+ "movq %%mm1, %%mm4\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "punpckhbw %%mm7, %%mm4\n\t"
+ "pmullw %2, %%mm0\n\t"
+ "pmullw %2, %%mm4\n\t"
+ "paddw %%mm0, %%mm2\n\t"
+ "paddw %%mm4, %%mm3\n\t"
+ "movq %0, %%mm0\n\t"
+ : : "m" (src[0]), "m" (src[1]), "m" (DD));
+
+ __asm__ volatile(
+ /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
+ "paddw %1, %%mm2\n\t"
+ "paddw %1, %%mm3\n\t"
+ "psrlw $6, %%mm2\n\t"
+ "psrlw $6, %%mm3\n\t"
+ "packuswb %%mm3, %%mm2\n\t"
+ H264_CHROMA_OP(%0, %%mm2)
+ "movq %%mm2, %0\n\t"
+ : "=m" (dst[0]) : "m" (*rnd_reg));
+ dst+= stride;
+ }
+}
+
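The bilinear path computes the standard H.264 chroma interpolation. As a
scalar sketch of the put case (the avg variants additionally average with the
existing destination via H264_CHROMA_OP; the 1-D path above likewise uses
weights (8-d, d) with d = x+y, rounding rnd ? 4 : 3 and shift 3):

/* A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy; A+B+C+D = 64, hence
 * the >>6.  rnd selects full (32) or reduced (28) rounding.
 * Illustrative scalar equivalent, not part of the patch. */
static void chroma_mc8_ref(uint8_t *dst, uint8_t *src, int stride,
                           int h, int x, int y, int rnd)
{
    const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j]        + B*src[j+1] +
                      C*src[j+stride] + D*src[j+stride+1] +
                      (rnd ? 32 : 28)) >> 6;
        src += stride;
        dst += stride;
    }
}
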
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %5, %%mm2 \n\t"
+ "movd %6, %%mm3 \n\t"
+ "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
+ "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
+ "punpcklwd %%mm2, %%mm2 \n\t"
+ "punpcklwd %%mm3, %%mm3 \n\t"
+ "punpcklwd %%mm2, %%mm2 \n\t"
+ "punpcklwd %%mm3, %%mm3 \n\t"
+ "psubw %%mm2, %%mm4 \n\t"
+ "psubw %%mm3, %%mm5 \n\t"
+
+ "movd (%1), %%mm0 \n\t"
+ "movd 1(%1), %%mm6 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm2, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd 1(%1), %%mm1 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm2, %%mm1 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "movq %%mm1, %%mm0 \n\t"
+ "pmullw %%mm5, %%mm6 \n\t"
+ "pmullw %%mm3, %%mm1 \n\t"
+ "paddw %4, %%mm6 \n\t"
+ "paddw %%mm6, %%mm1 \n\t"
+ "psrlw $6, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm6)
+ "movd %%mm1, (%0) \n\t"
+ "add %3, %0 \n\t"
+ "movd (%1), %%mm6 \n\t"
+ "movd 1(%1), %%mm1 \n\t"
+ "add %3, %1 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmullw %%mm2, %%mm1 \n\t"
+ "paddw %%mm6, %%mm1 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "pmullw %%mm5, %%mm0 \n\t"
+ "pmullw %%mm3, %%mm1 \n\t"
+ "paddw %4, %%mm0 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psrlw $6, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm0)
+ "movd %%mm1, (%0) \n\t"
+ "add %3, %0 \n\t"
+ "sub $2, %2 \n\t"
+ "jnz 1b \n\t"
+ : "+r"(dst), "+r"(src), "+r"(h)
+ : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+ );
+}
+
+#ifdef H264_CHROMA_MC2_TMPL
+static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ int tmp = ((1<<16)-1)*x + 8;
+ int CD= tmp*y;
+ int AB= (tmp<<3) - CD;
+ __asm__ volatile(
+ /* mm5 = {A,B,A,B} */
+ /* mm6 = {C,D,C,D} */
+ "movd %0, %%mm5\n\t"
+ "movd %1, %%mm6\n\t"
+ "punpckldq %%mm5, %%mm5\n\t"
+ "punpckldq %%mm6, %%mm6\n\t"
+ "pxor %%mm7, %%mm7\n\t"
+ /* mm0 = src[0,1,1,2] */
+ "movd %2, %%mm2\n\t"
+ "punpcklbw %%mm7, %%mm2\n\t"
+ "pshufw $0x94, %%mm2, %%mm2\n\t"
+ :: "r"(AB), "r"(CD), "m"(src[0]));
+
+
+ __asm__ volatile(
+ "1:\n\t"
+ "add %4, %1\n\t"
+ /* mm1 = A * src[0,1] + B * src[1,2] */
+ "movq %%mm2, %%mm1\n\t"
+ "pmaddwd %%mm5, %%mm1\n\t"
+ /* mm0 = src[0,1,1,2] */
+ "movd (%1), %%mm0\n\t"
+ "punpcklbw %%mm7, %%mm0\n\t"
+ "pshufw $0x94, %%mm0, %%mm0\n\t"
+ /* mm1 += C * src[0,1] + D * src[1,2] */
+ "movq %%mm0, %%mm2\n\t"
+ "pmaddwd %%mm6, %%mm0\n\t"
+ "paddw %3, %%mm1\n\t"
+ "paddw %%mm0, %%mm1\n\t"
+ /* dst[0,1] = pack((mm1 + 32) >> 6) */
+ "psrlw $6, %%mm1\n\t"
+ "packssdw %%mm7, %%mm1\n\t"
+ "packuswb %%mm7, %%mm1\n\t"
+ H264_CHROMA_OP4((%0), %%mm1, %%mm3)
+ "movd %%mm1, %%esi\n\t"
+ "movw %%si, (%0)\n\t"
+ "add %4, %0\n\t"
+ "sub $1, %2\n\t"
+ "jnz 1b\n\t"
+ : "+r" (dst), "+r"(src), "+r"(h)
+ : "m" (ff_pw_32), "r"((x86_reg)stride)
+ : "%esi");
+
+}
+#endif
+
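The integer setup at the top of H264_CHROMA_MC2_TMPL packs two 16-bit
coefficients per register with a single multiply. Worked through for
illustration:

int tmp = ((1<<16)-1)*x + 8; /* low word = 8-x, high word = x            */
int CD  = tmp*y;             /* low word = (8-x)*y = C, high = x*y = D   */
int AB  = (tmp<<3) - CD;     /* low word = (8-x)*(8-y) = A,
                                high word = x*(8-y) = B                  */

pmaddwd then evaluates A*src[n] + B*src[n+1] (and the C/D pair for the next
row) in one instruction per pixel pair.
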
diff --git a/libavcodec/x86/dsputil_h264_template_ssse3.c b/libavcodec/x86/dsputil_h264_template_ssse3.c
new file mode 100644
index 0000000000..e29e05e7c8
--- /dev/null
+++ b/libavcodec/x86/dsputil_h264_template_ssse3.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
+ * H264_CHROMA_MC8_TMPL must be defined to the desired function name
+ * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
+ * AVG_OP must be defined to empty for put and the identity for avg
+ */
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+{
+ if(y==0 && x==0) {
+ /* no filter needed */
+ H264_CHROMA_MC8_MV0(dst, src, stride, h);
+ return;
+ }
+
+ assert(x<8 && y<8 && x>=0 && y>=0);
+
+ if(y==0 || x==0)
+ {
+ /* 1 dimensional filter only */
+ __asm__ volatile(
+ "movd %0, %%xmm7 \n\t"
+ "movq %1, %%xmm6 \n\t"
+ "pshuflw $0, %%xmm7, %%xmm7 \n\t"
+ "movlhps %%xmm6, %%xmm6 \n\t"
+ "movlhps %%xmm7, %%xmm7 \n\t"
+ :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
+ );
+
+ if(x) {
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1), %%xmm0 \n\t"
+ "movq 1(%1), %%xmm1 \n\t"
+ "movq (%1,%3), %%xmm2 \n\t"
+ "movq 1(%1,%3), %%xmm3 \n\t"
+ "punpcklbw %%xmm1, %%xmm0 \n\t"
+ "punpcklbw %%xmm3, %%xmm2 \n\t"
+ "pmaddubsw %%xmm7, %%xmm0 \n\t"
+ "pmaddubsw %%xmm7, %%xmm2 \n\t"
+ AVG_OP("movq (%0), %%xmm4 \n\t")
+ AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm2 \n\t"
+ "psrlw $3, %%xmm0 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "packuswb %%xmm2, %%xmm0 \n\t"
+ AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
+ "movq %%xmm0, (%0) \n\t"
+ "movhps %%xmm0, (%0,%3) \n\t"
+ "sub $2, %2 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%0,%3,2), %0 \n\t"
+ "jg 1b \n\t"
+ :"+r"(dst), "+r"(src), "+r"(h)
+ :"r"((x86_reg)stride)
+ );
+ } else {
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1), %%xmm0 \n\t"
+ "movq (%1,%3), %%xmm1 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+ "movq (%1,%3,2), %%xmm3 \n\t"
+ "punpcklbw %%xmm1, %%xmm0 \n\t"
+ "punpcklbw %%xmm3, %%xmm2 \n\t"
+ "pmaddubsw %%xmm7, %%xmm0 \n\t"
+ "pmaddubsw %%xmm7, %%xmm2 \n\t"
+ AVG_OP("movq (%0), %%xmm4 \n\t")
+ AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm2 \n\t"
+ "psrlw $3, %%xmm0 \n\t"
+ "psrlw $3, %%xmm2 \n\t"
+ "packuswb %%xmm2, %%xmm0 \n\t"
+ AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
+ "movq %%xmm0, (%0) \n\t"
+ "movhps %%xmm0, (%0,%3) \n\t"
+ "sub $2, %2 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%0,%3,2), %0 \n\t"
+ "jg 1b \n\t"
+ :"+r"(dst), "+r"(src), "+r"(h)
+ :"r"((x86_reg)stride)
+ );
+ }
+ return;
+ }
+
+ /* general case, bilinear */
+ __asm__ volatile(
+ "movd %0, %%xmm7 \n\t"
+ "movd %1, %%xmm6 \n\t"
+ "movdqa %2, %%xmm5 \n\t"
+ "pshuflw $0, %%xmm7, %%xmm7 \n\t"
+ "pshuflw $0, %%xmm6, %%xmm6 \n\t"
+ "movlhps %%xmm7, %%xmm7 \n\t"
+ "movlhps %%xmm6, %%xmm6 \n\t"
+ :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
+ );
+
+ __asm__ volatile(
+ "movq (%1), %%xmm0 \n\t"
+ "movq 1(%1), %%xmm1 \n\t"
+ "punpcklbw %%xmm1, %%xmm0 \n\t"
+ "add %3, %1 \n\t"
+ "1: \n\t"
+ "movq (%1), %%xmm1 \n\t"
+ "movq 1(%1), %%xmm2 \n\t"
+ "movq (%1,%3), %%xmm3 \n\t"
+ "movq 1(%1,%3), %%xmm4 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "punpcklbw %%xmm2, %%xmm1 \n\t"
+ "punpcklbw %%xmm4, %%xmm3 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+ "movdqa %%xmm3, %%xmm4 \n\t"
+ "pmaddubsw %%xmm7, %%xmm0 \n\t"
+ "pmaddubsw %%xmm6, %%xmm1 \n\t"
+ "pmaddubsw %%xmm7, %%xmm2 \n\t"
+ "pmaddubsw %%xmm6, %%xmm3 \n\t"
+ "paddw %%xmm5, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm1 \n\t"
+ "paddw %%xmm2, %%xmm3 \n\t"
+ "movdqa %%xmm4, %%xmm0 \n\t"
+ "psrlw $6, %%xmm1 \n\t"
+ "psrlw $6, %%xmm3 \n\t"
+ AVG_OP("movq (%0), %%xmm2 \n\t")
+ AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
+ "packuswb %%xmm3, %%xmm1 \n\t"
+ AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
+ "movq %%xmm1, (%0)\n\t"
+ "movhps %%xmm1, (%0,%3)\n\t"
+ "sub $2, %2 \n\t"
+ "lea (%0,%3,2), %0 \n\t"
+ "jg 1b \n\t"
+ :"+r"(dst), "+r"(src), "+r"(h)
+ :"r"((x86_reg)stride)
+ );
+}
+
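The coefficient setup feeds pmaddubsw, which multiplies unsigned source bytes
by signed coefficient bytes and sums adjacent pairs into words. The byte
packing works like the MC2 word packing in the MMX template, one level
smaller (illustrative):

int AB = (x*255 + 8) * (8-y); /* byte 0 = (8-x)*(8-y) = A,
                                 byte 1 =     x*(8-y) = B */
int CD = (x*255 + 8) * y;     /* byte 0 = (8-x)*y = C,
                                 byte 1 =     x*y = D */

After punpcklbw interleaves src[j] with src[j+1], a single pmaddubsw per row
yields A*src[j] + B*src[j+1] directly.
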
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ __asm__ volatile(
+ "movd %0, %%mm7 \n\t"
+ "movd %1, %%mm6 \n\t"
+ "movq %2, %%mm5 \n\t"
+ "pshufw $0, %%mm7, %%mm7 \n\t"
+ "pshufw $0, %%mm6, %%mm6 \n\t"
+ :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
+ );
+
+ __asm__ volatile(
+ "movd (%1), %%mm0 \n\t"
+ "punpcklbw 1(%1), %%mm0 \n\t"
+ "add %3, %1 \n\t"
+ "1: \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "movd (%1,%3), %%mm3 \n\t"
+ "punpcklbw 1(%1), %%mm1 \n\t"
+ "punpcklbw 1(%1,%3), %%mm3 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pmaddubsw %%mm7, %%mm0 \n\t"
+ "pmaddubsw %%mm6, %%mm1 \n\t"
+ "pmaddubsw %%mm7, %%mm2 \n\t"
+ "pmaddubsw %%mm6, %%mm3 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm5, %%mm2 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "paddw %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "psrlw $6, %%mm1 \n\t"
+ "psrlw $6, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ "packuswb %%mm3, %%mm3 \n\t"
+ AVG_OP("pavgb (%0), %%mm1 \n\t")
+ AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
+ "movd %%mm1, (%0)\n\t"
+ "movd %%mm3, (%0,%3)\n\t"
+ "sub $2, %2 \n\t"
+ "lea (%0,%3,2), %0 \n\t"
+ "jg 1b \n\t"
+ :"+r"(dst), "+r"(src), "+r"(h)
+ :"r"((x86_reg)stride)
+ );
+}
+
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
new file mode 100644
index 0000000000..2ec8c3255d
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -0,0 +1,2976 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/h263.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/simple_idct.h"
+#include "dsputil_mmx.h"
+#include "mmx.h"
+#include "vp3dsp_mmx.h"
+#include "vp3dsp_sse2.h"
+#include "idct_xvid.h"
+
+//#undef NDEBUG
+//#include <assert.h>
+
+int mm_flags; /* multimedia extension flags */
+
+/* pixel operations */
+DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
+
+DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
+DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+
+DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
+DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
+
+#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
+
+#define MOVQ_BFE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
+ "paddb %%" #regd ", %%" #regd " \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
+#else
+// For shared libraries it is better to access constants this way;
+// pcmpeqd of a register with itself yields all-ones (-1).
+#define MOVQ_BONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "packuswb %%" #regd ", %%" #regd " \n\t" ::)
+
+#define MOVQ_WTWO(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd " \n\t" \
+ "psllw $1, %%" #regd " \n\t"::)
+
+#endif
+
+// using regr as temporary and for the output result
+// first argument is unmodified and second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
+ "movq " #rega ", " #regr " \n\t"\
+ "pand " #regb ", " #regr " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pand " #regfe "," #regb " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "paddb " #regb ", " #regr " \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe) \
+ "movq " #rega ", " #regr " \n\t"\
+ "por " #regb ", " #regr " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pand " #regfe "," #regb " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psubb " #regb ", " #regr " \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
+ "movq " #rega ", " #regr " \n\t"\
+ "movq " #regc ", " #regp " \n\t"\
+ "pand " #regb ", " #regr " \n\t"\
+ "pand " #regd ", " #regp " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pxor " #regc ", " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psrlq $1, " #regd " \n\t"\
+ "paddb " #regb ", " #regr " \n\t"\
+ "paddb " #regd ", " #regp " \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
+ "movq " #rega ", " #regr " \n\t"\
+ "movq " #regc ", " #regp " \n\t"\
+ "por " #regb ", " #regr " \n\t"\
+ "por " #regd ", " #regp " \n\t"\
+ "pxor " #rega ", " #regb " \n\t"\
+ "pxor " #regc ", " #regd " \n\t"\
+ "pand %%mm6, " #regb " \n\t"\
+ "pand %%mm6, " #regd " \n\t"\
+ "psrlq $1, " #regd " \n\t"\
+ "psrlq $1, " #regb " \n\t"\
+ "psubb " #regb ", " #regr " \n\t"\
+ "psubb " #regd ", " #regp " \n\t"
+
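Both macro families emulate a byte-wise average from boolean operations; the
underlying identities, exact for unsigned bytes, are sketched below (the 0xFE
mask clears the bit that psrlq would otherwise shift in from the neighboring
byte lane; illustrative, not part of the patch):

/* rounded:   (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)
 * truncated: (a + b)     >> 1 == (a & b) + (((a ^ b) & 0xFE) >> 1) */
static inline int avg_rnd   (int a, int b)
{ return (a | b) - (((a ^ b) & 0xFE) >> 1); }
static inline int avg_no_rnd(int a, int b)
{ return (a & b) + (((a ^ b) & 0xFE) >> 1); }
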
+/***********************************/
+/* MMX no rounding */
+#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
+#define SET_RND MOVQ_WONE
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
+
+#include "dsputil_mmx_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ##_mmx
+#define SET_RND MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
+
+#include "dsputil_mmx_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+
+/***********************************/
+/* 3Dnow specific */
+
+#define DEF(x) x ## _3dnow
+#define PAVGB "pavgusb"
+
+#include "dsputil_mmx_avg_template.c"
+
+#undef DEF
+#undef PAVGB
+
+/***********************************/
+/* MMX2 specific */
+
+#define DEF(x) x ## _mmx2
+
+/* The pavgb instruction was introduced with the MMX2 instruction set */
+#define PAVGB "pavgb"
+
+#include "dsputil_mmx_avg_template.c"
+
+#undef DEF
+#undef PAVGB
+
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmx2 put_pixels16_mmx
+#define put_pixels8_mmx2 put_pixels8_mmx
+#define put_pixels4_mmx2 put_pixels4_mmx
+#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
+#define put_pixels16_3dnow put_pixels16_mmx
+#define put_pixels8_3dnow put_pixels8_mmx
+#define put_pixels4_3dnow put_pixels4_mmx
+#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
+
+/***********************************/
+/* standard MMX */
+
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ const DCTELEM *p;
+ uint8_t *pix;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ /* unrolled loop */
+ __asm__ volatile(
+ "movq %3, %%mm0 \n\t"
+ "movq 8%3, %%mm1 \n\t"
+ "movq 16%3, %%mm2 \n\t"
+ "movq 24%3, %%mm3 \n\t"
+ "movq 32%3, %%mm4 \n\t"
+ "movq 40%3, %%mm5 \n\t"
+ "movq 48%3, %%mm6 \n\t"
+ "movq 56%3, %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
+ :"memory");
+ pix += line_size*4;
+ p += 32;
+
+    // If this block were an exact copy of the one above, the compiler
+    // would generate some very strange code, hence the "r" constraint
+    // for the block pointer here.
+ __asm__ volatile(
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "packuswb %%mm7, %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, (%0, %1) \n\t"
+ "movq %%mm4, (%0, %1, 2) \n\t"
+ "movq %%mm6, (%0, %2) \n\t"
+ ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
+ :"memory");
+}
+
+static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
+ { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ int i;
+
+ movq_m2r(*vector128, mm1);
+ for (i = 0; i < 8; i++) {
+ movq_m2r(*(block), mm0);
+ packsswb_m2r(*(block + 4), mm0);
+ block += 8;
+ paddb_r2r(mm1, mm0);
+ movq_r2m(mm0, *pixels);
+ pixels += line_size;
+ }
+}
+
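put_signed_pixels_clamped_mmx amounts to the following per coefficient:
packsswb provides the signed clamp and the vector128 add supplies the +128
bias into unsigned pixel range (a scalar sketch, not part of the patch):

/* Clamp each coefficient to [-128,127], then bias into 0..255. */
static void put_signed_pixels_clamped_ref(const DCTELEM *block,
                                          uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = av_clip(block[j], -128, 127) + 128;
        block  += 8;
        pixels += line_size;
    }
}
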
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ const DCTELEM *p;
+ uint8_t *pix;
+ int i;
+
+ /* read the pixels */
+ p = block;
+ pix = pixels;
+ MOVQ_ZERO(mm7);
+ i = 4;
+ do {
+ __asm__ volatile(
+ "movq (%2), %%mm0 \n\t"
+ "movq 8(%2), %%mm1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm6 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm4, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm6 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddsw %%mm6, %%mm2 \n\t"
+ "paddsw %%mm5, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %0 \n\t"
+ "movq %%mm2, %1 \n\t"
+ :"+m"(*pix), "+m"(*(pix+line_size))
+ :"r"(p)
+ :"memory");
+ pix += line_size*2;
+ p += 16;
+ } while (--i);
+}
+
+static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %3), %%mm1 \n\t"
+ "movd %%mm0, (%2) \n\t"
+ "movd %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %3), %%mm1 \n\t"
+ "movd %%mm0, (%2) \n\t"
+ "movd %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm5 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size)
+ : "%"REG_a, "memory"
+ );
+}
+
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1,%3), %%xmm1 \n\t"
+ "movdqu (%1,%3,2), %%xmm2 \n\t"
+ "movdqu (%1,%4), %%xmm3 \n\t"
+ "movdqa %%xmm0, (%2) \n\t"
+ "movdqa %%xmm1, (%2,%3) \n\t"
+ "movdqa %%xmm2, (%2,%3,2) \n\t"
+ "movdqa %%xmm3, (%2,%4) \n\t"
+ "subl $4, %0 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
+ "lea (%2,%3,4), %2 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ : "memory"
+ );
+}
+
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1,%3), %%xmm1 \n\t"
+ "movdqu (%1,%3,2), %%xmm2 \n\t"
+ "movdqu (%1,%4), %%xmm3 \n\t"
+ "pavgb (%2), %%xmm0 \n\t"
+ "pavgb (%2,%3), %%xmm1 \n\t"
+ "pavgb (%2,%3,2), %%xmm2 \n\t"
+ "pavgb (%2,%4), %%xmm3 \n\t"
+ "movdqa %%xmm0, (%2) \n\t"
+ "movdqa %%xmm1, (%2,%3) \n\t"
+ "movdqa %%xmm2, (%2,%3,2) \n\t"
+ "movdqa %%xmm3, (%2,%4) \n\t"
+ "subl $4, %0 \n\t"
+ "lea (%1,%3,4), %1 \n\t"
+ "lea (%2,%3,4), %2 \n\t"
+ "jnz 1b \n\t"
+ : "+g"(h), "+r" (pixels), "+r" (block)
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ : "memory"
+ );
+}
+
+#define CLEAR_BLOCKS(name,n) \
+static void name(DCTELEM *blocks)\
+{\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "mov %1, %%"REG_a" \n\t"\
+ "1: \n\t"\
+ "movq %%mm7, (%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
+ "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
+ "add $32, %%"REG_a" \n\t"\
+ " js 1b \n\t"\
+ : : "r" (((uint8_t *)blocks)+128*n),\
+ "i" (-128*n)\
+ : "%"REG_a\
+ );\
+}
+CLEAR_BLOCKS(clear_blocks_mmx, 6)
+CLEAR_BLOCKS(clear_block_mmx, 1)
+
+static void clear_block_sse(DCTELEM *block)
+{
+ __asm__ volatile(
+ "xorps %%xmm0, %%xmm0 \n"
+ "movaps %%xmm0, (%0) \n"
+ "movaps %%xmm0, 16(%0) \n"
+ "movaps %%xmm0, 32(%0) \n"
+ "movaps %%xmm0, 48(%0) \n"
+ "movaps %%xmm0, 64(%0) \n"
+ "movaps %%xmm0, 80(%0) \n"
+ "movaps %%xmm0, 96(%0) \n"
+ "movaps %%xmm0, 112(%0) \n"
+ :: "r"(block)
+ : "memory"
+ );
+}
+
+static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
+ x86_reg i=0;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq (%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, (%2, %0) \n\t"
+ "movq 8(%1, %0), %%mm0 \n\t"
+ "movq 8(%2, %0), %%mm1 \n\t"
+ "paddb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "2: \n\t"
+ "cmp %3, %0 \n\t"
+ " js 1b \n\t"
+ : "+r" (i)
+ : "r"(src), "r"(dst), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i+0] += src[i+0];
+}
+
+static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+ x86_reg i=0;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+ "movq (%2, %0), %%mm0 \n\t"
+ "movq 8(%2, %0), %%mm1 \n\t"
+ "paddb (%3, %0), %%mm0 \n\t"
+ "paddb 8(%3, %0), %%mm1 \n\t"
+ "movq %%mm0, (%1, %0) \n\t"
+ "movq %%mm1, 8(%1, %0) \n\t"
+ "add $16, %0 \n\t"
+ "2: \n\t"
+ "cmp %4, %0 \n\t"
+ " js 1b \n\t"
+ : "+r" (i)
+ : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i] = src1[i] + src2[i];
+}
+
+#define H263_LOOP_FILTER \
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm0 \n\t"\
+ "movq %0, %%mm1 \n\t"\
+ "movq %3, %%mm2 \n\t"\
+ "movq %3, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm3, %%mm1 \n\t"\
+ "movq %1, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "movq %2, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm4 \n\t"\
+ "psubw %%mm3, %%mm5 \n\t"\
+ "psllw $2, %%mm4 \n\t"\
+ "psllw $2, %%mm5 \n\t"\
+ "paddw %%mm0, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpgtw %%mm4, %%mm6 \n\t"\
+ "pcmpgtw %%mm5, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "pxor %%mm7, %%mm5 \n\t"\
+ "psubw %%mm6, %%mm4 \n\t"\
+ "psubw %%mm7, %%mm5 \n\t"\
+ "psrlw $3, %%mm4 \n\t"\
+ "psrlw $3, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm4 \n\t"\
+ "packsswb %%mm7, %%mm6 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd %4, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "psubusb %%mm4, %%mm2 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm4, %%mm3 \n\t"\
+ "psubb %%mm3, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm2 \n\t"\
+ "packsswb %%mm1, %%mm0 \n\t"\
+ "pcmpgtb %%mm0, %%mm7 \n\t"\
+ "pxor %%mm7, %%mm0 \n\t"\
+ "psubb %%mm7, %%mm0 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "psubusb %%mm2, %%mm0 \n\t"\
+ "psubb %%mm0, %%mm1 \n\t"\
+ "pand %5, %%mm1 \n\t"\
+ "psrlw $2, %%mm1 \n\t"\
+ "pxor %%mm7, %%mm1 \n\t"\
+ "psubb %%mm7, %%mm1 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ "movq %3, %%mm6 \n\t"\
+ "psubb %%mm1, %%mm5 \n\t"\
+ "paddb %%mm1, %%mm6 \n\t"
+
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ if(ENABLE_ANY_H263) {
+ const int strength= ff_h263_loop_filter_strength[qscale];
+
+ __asm__ volatile(
+
+ H263_LOOP_FILTER
+
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %0 \n\t"
+ "movq %%mm6, %3 \n\t"
+ : "+m" (*(uint64_t*)(src - 2*stride)),
+ "+m" (*(uint64_t*)(src - 1*stride)),
+ "+m" (*(uint64_t*)(src + 0*stride)),
+ "+m" (*(uint64_t*)(src + 1*stride))
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+ }
+}
+
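+/* 4x4 byte transpose via the interleave instructions: two rounds of
+ * punpckl{bw,wd} turn four packed rows into four packed columns. */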
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+ __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
+ "movd %4, %%mm0 \n\t"
+ "movd %5, %%mm1 \n\t"
+ "movd %6, %%mm2 \n\t"
+ "movd %7, %%mm3 \n\t"
+ "punpcklbw %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %1 \n\t"
+ "movd %%mm1, %2 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %3 \n\t"
+
+ : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 1*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 2*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 3*dst_stride))
+ : "m" (*(uint32_t*)(src + 0*src_stride)),
+ "m" (*(uint32_t*)(src + 1*src_stride)),
+ "m" (*(uint32_t*)(src + 2*src_stride)),
+ "m" (*(uint32_t*)(src + 3*src_stride))
+ );
+}
+
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ if(ENABLE_ANY_H263) {
+ const int strength= ff_h263_loop_filter_strength[qscale];
+ DECLARE_ALIGNED(8, uint64_t, temp[4]);
+ uint8_t *btemp= (uint8_t*)temp;
+
+ src -= 2;
+
+ transpose4x4(btemp , src , 8, stride);
+ transpose4x4(btemp+4, src + 4*stride, 8, stride);
+ __asm__ volatile(
+ H263_LOOP_FILTER // 5 3 4 6
+
+ : "+m" (temp[0]),
+ "+m" (temp[1]),
+ "+m" (temp[2]),
+ "+m" (temp[3])
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+
+ __asm__ volatile(
+ "movq %%mm5, %%mm1 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpckhbw %%mm3, %%mm1 \n\t"
+ "punpckhbw %%mm6, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "punpcklwd %%mm4, %%mm5 \n\t"
+ "punpcklwd %%mm0, %%mm1 \n\t"
+ "punpckhwd %%mm4, %%mm3 \n\t"
+ "punpckhwd %%mm0, %%mm6 \n\t"
+ "movd %%mm5, (%0) \n\t"
+ "punpckhdq %%mm5, %%mm5 \n\t"
+ "movd %%mm5, (%0,%2) \n\t"
+ "movd %%mm3, (%0,%2,2) \n\t"
+ "punpckhdq %%mm3, %%mm3 \n\t"
+ "movd %%mm3, (%0,%3) \n\t"
+ "movd %%mm1, (%1) \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, (%1,%2) \n\t"
+ "movd %%mm6, (%1,%2,2) \n\t"
+ "punpckhdq %%mm6, %%mm6 \n\t"
+ "movd %%mm6, (%1,%3) \n\t"
+ :: "r" (src),
+ "r" (src + 4*stride),
+ "r" ((x86_reg) stride ),
+ "r" ((x86_reg)(3*stride))
+ );
+ }
+}
+
+/* Draw the edges of width 'w' of an image of size width x height.
+   This MMX version can only handle w==8 || w==16. */
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
+{
+ uint8_t *ptr, *last_line;
+ int i;
+
+ last_line = buf + (height - 1) * wrap;
+ /* left and right */
+ ptr = buf;
+ if(w==8)
+ {
+ __asm__ volatile(
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
+ );
+ }
+ else
+ {
+ __asm__ volatile(
+ "1: \n\t"
+ "movd (%0), %%mm0 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t"
+ "punpcklwd %%mm0, %%mm0 \n\t"
+ "punpckldq %%mm0, %%mm0 \n\t"
+ "movq %%mm0, -8(%0) \n\t"
+ "movq %%mm0, -16(%0) \n\t"
+ "movq -8(%0, %2), %%mm1 \n\t"
+ "punpckhbw %%mm1, %%mm1 \n\t"
+ "punpckhwd %%mm1, %%mm1 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movq %%mm1, (%0, %2) \n\t"
+ "movq %%mm1, 8(%0, %2) \n\t"
+ "add %1, %0 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
+ );
+ }
+
+ for(i=0;i<w;i+=4) {
+ /* top and bottom (and hopefully also the corners) */
+ ptr= buf - (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
+ );
+ ptr= last_line + (i + 1) * wrap - w;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm0, (%0, %2) \n\t"
+ "movq %%mm0, (%0, %2, 2) \n\t"
+ "movq %%mm0, (%0, %3) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (ptr)
+ : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
+ );
+ }
+}
+
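+/* PNG Paeth prediction: each output byte adds to src the neighbour
+ * (left, top or top-left) whose value is closest to left + top - topleft.
+ * A scalar sketch of the per-byte predictor (illustrative only, kept out
+ * of the build; the real work happens vectorized below):
+ */
+#if 0
+static int paeth_predictor(int left, int top, int topleft)
+{
+    int p  = left + top - topleft;   /* initial estimate */
+    int pa = FFABS(p - left);
+    int pb = FFABS(p - top);
+    int pc = FFABS(p - topleft);
+    if (pa <= pb && pa <= pc) return left;
+    if (pb <= pc)             return top;
+    return topleft;
+}
+#endif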
+#define PAETH(cpu, abs3)\
+static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
+{\
+ x86_reg i = -bpp;\
+ x86_reg end = w-3;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n"\
+ "movd (%1,%0), %%mm0 \n"\
+ "movd (%2,%0), %%mm1 \n"\
+ "punpcklbw %%mm7, %%mm0 \n"\
+ "punpcklbw %%mm7, %%mm1 \n"\
+ "add %4, %0 \n"\
+ "1: \n"\
+ "movq %%mm1, %%mm2 \n"\
+ "movd (%2,%0), %%mm1 \n"\
+ "movq %%mm2, %%mm3 \n"\
+ "punpcklbw %%mm7, %%mm1 \n"\
+ "movq %%mm2, %%mm4 \n"\
+ "psubw %%mm1, %%mm3 \n"\
+ "psubw %%mm0, %%mm4 \n"\
+ "movq %%mm3, %%mm5 \n"\
+ "paddw %%mm4, %%mm5 \n"\
+ abs3\
+ "movq %%mm4, %%mm6 \n"\
+ "pminsw %%mm5, %%mm6 \n"\
+ "pcmpgtw %%mm6, %%mm3 \n"\
+ "pcmpgtw %%mm5, %%mm4 \n"\
+ "movq %%mm4, %%mm6 \n"\
+ "pand %%mm3, %%mm4 \n"\
+ "pandn %%mm3, %%mm6 \n"\
+ "pandn %%mm0, %%mm3 \n"\
+ "movd (%3,%0), %%mm0 \n"\
+ "pand %%mm1, %%mm6 \n"\
+ "pand %%mm4, %%mm2 \n"\
+ "punpcklbw %%mm7, %%mm0 \n"\
+ "movq %6, %%mm5 \n"\
+ "paddw %%mm6, %%mm0 \n"\
+ "paddw %%mm2, %%mm3 \n"\
+ "paddw %%mm3, %%mm0 \n"\
+ "pand %%mm5, %%mm0 \n"\
+ "movq %%mm0, %%mm3 \n"\
+ "packuswb %%mm3, %%mm3 \n"\
+ "movd %%mm3, (%1,%0) \n"\
+ "add %4, %0 \n"\
+ "cmp %5, %0 \n"\
+ "jle 1b \n"\
+ :"+r"(i)\
+ :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
+ "m"(ff_pw_255)\
+ :"memory"\
+ );\
+}
+
+#define ABS3_MMX2\
+ "psubw %%mm5, %%mm7 \n"\
+ "pmaxsw %%mm7, %%mm5 \n"\
+ "pxor %%mm6, %%mm6 \n"\
+ "pxor %%mm7, %%mm7 \n"\
+ "psubw %%mm3, %%mm6 \n"\
+ "psubw %%mm4, %%mm7 \n"\
+ "pmaxsw %%mm6, %%mm3 \n"\
+ "pmaxsw %%mm7, %%mm4 \n"\
+ "pxor %%mm7, %%mm7 \n"
+
+#define ABS3_SSSE3\
+ "pabsw %%mm3, %%mm3 \n"\
+ "pabsw %%mm4, %%mm4 \n"\
+ "pabsw %%mm5, %%mm5 \n"
+
+PAETH(mmx2, ABS3_MMX2)
+#ifdef HAVE_SSSE3
+PAETH(ssse3, ABS3_SSSE3)
+#endif
+
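+/* MPEG-4 quarter-pel interpolation. The half-pel value for a pixel is the
+ * symmetric 8-tap filter (-1, 3, -6, 20, 20, -6, 3, -1) applied to its
+ * neighbours: with the pairwise sums a,b,c,d this is
+ * (20*a - 6*b + 3*c - d + R) >> 5, where R is 16 for the rounding and 15
+ * for the no-rnd variants (cf. ff_pw_16/ff_pw_15 in the instantiations
+ * below). */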
+#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
+ "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
+ "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
+ "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
+ "movq "#in7", " #m3 " \n\t" /* d */\
+ "movq "#in0", %%mm5 \n\t" /* D */\
+ "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
+ "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
+ "movq "#in1", %%mm5 \n\t" /* C */\
+ "movq "#in2", %%mm6 \n\t" /* B */\
+ "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
+ "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
+ "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
+ "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
+ "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
+ "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
+ "psraw $5, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm5 \n\t"\
+ OP(%%mm5, out, %%mm7, d)
+
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ uint64_t temp;\
+\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
+ "paddw %%mm3, %%mm5 \n\t" /* b */\
+ "paddw %%mm2, %%mm6 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
+ "paddw %%mm4, %%mm0 \n\t" /* a */\
+ "paddw %%mm1, %%mm5 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
+ "paddw %6, %%mm6 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ "movq %%mm0, %5 \n\t"\
+ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+ \
+ "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
+ "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
+ "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
+ "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
+ "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
+ "paddw %%mm0, %%mm2 \n\t" /* b */\
+ "paddw %%mm5, %%mm3 \n\t" /* c */\
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */\
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
+ "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
+ "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
+ "paddw %%mm2, %%mm1 \n\t" /* a */\
+ "paddw %%mm6, %%mm4 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+ "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
+ "paddw %6, %%mm1 \n\t"\
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
+ "psraw $5, %%mm3 \n\t"\
+ "movq %5, %%mm1 \n\t"\
+ "packuswb %%mm3, %%mm1 \n\t"\
+ OP_MMX2(%%mm1, (%1),%%mm4, q)\
+ /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
+ \
+ "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
+ "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
+ "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
+ "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
+ "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
+ "paddw %%mm1, %%mm5 \n\t" /* b */\
+ "paddw %%mm4, %%mm0 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
+ "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
+ "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
+ "paddw %%mm3, %%mm2 \n\t" /* d */\
+ "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
+ "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
+ "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
+ "paddw %%mm2, %%mm6 \n\t" /* a */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
+ "paddw %6, %%mm0 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
+ \
+ "paddw %%mm5, %%mm3 \n\t" /* a */\
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
+ "paddw %%mm4, %%mm6 \n\t" /* b */\
+ "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
+ "paddw %%mm1, %%mm4 \n\t" /* c */\
+ "paddw %%mm2, %%mm5 \n\t" /* d */\
+ "paddw %%mm6, %%mm6 \n\t" /* 2b */\
+ "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
+ "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
+ "paddw %6, %%mm4 \n\t"\
+ "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm4 \n\t"\
+ "packuswb %%mm4, %%mm0 \n\t"\
+ OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
+ \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+D"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int i;\
+ int16_t temp[16];\
+ /* quick HACK, XXX FIXME MUST be optimized */\
+ for(i=0; i<h; i++)\
+ {\
+ temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
+ temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+ temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+ temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
+ temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+ temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
+ temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
+ temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
+ temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
+ temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
+ temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
+ temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
+ temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
+ temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
+ temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
+ temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, (%1), %%mm1, q)\
+ "movq 16(%0), %%mm0 \n\t"\
+ "movq 24(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
+ :: "r"(temp), "r"(dst), "m"(ROUNDER)\
+ : "memory"\
+ );\
+ dst+=dstStride;\
+ src+=srcStride;\
+ }\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
+ "paddw %%mm3, %%mm5 \n\t" /* b */\
+ "paddw %%mm2, %%mm6 \n\t" /* c */\
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */\
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
+ "paddw %%mm4, %%mm0 \n\t" /* a */\
+ "paddw %%mm1, %%mm5 \n\t" /* d */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
+ "paddw %5, %%mm6 \n\t"\
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm0 \n\t"\
+ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
+ \
+ "movd 5(%0), %%mm5 \n\t" /* FGHI */\
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
+ "paddw %%mm5, %%mm1 \n\t" /* a */\
+ "paddw %%mm6, %%mm2 \n\t" /* b */\
+ "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
+ "paddw %%mm6, %%mm3 \n\t" /* c */\
+ "paddw %%mm5, %%mm4 \n\t" /* d */\
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */\
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
+ "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
+ "paddw %5, %%mm1 \n\t"\
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
+ "psraw $5, %%mm3 \n\t"\
+ "packuswb %%mm3, %%mm0 \n\t"\
+ OP_MMX2(%%mm0, (%1), %%mm4, q)\
+ \
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(h)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int i;\
+ int16_t temp[8];\
+ /* quick HACK, XXX FIXME MUST be optimized */\
+ for(i=0; i<h; i++)\
+ {\
+ temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
+ temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
+ temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
+ temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
+ temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
+ temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
+ temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
+ temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "paddw %2, %%mm0 \n\t"\
+ "paddw %2, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP_3DNOW(%%mm0, (%1), %%mm1, q)\
+ :: "r"(temp), "r"(dst), "m"(ROUNDER)\
+ :"memory"\
+ );\
+ dst+=dstStride;\
+ src+=srcStride;\
+ }\
+}
+
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
+\
+static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ uint64_t temp[17*4];\
+ uint64_t *temp_ptr= temp;\
+ int count= 17;\
+\
+ /*FIXME unroll */\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq (%0), %%mm1 \n\t"\
+ "movq 8(%0), %%mm2 \n\t"\
+ "movq 8(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "movq %%mm0, (%1) \n\t"\
+ "movq %%mm1, 17*8(%1) \n\t"\
+ "movq %%mm2, 2*17*8(%1) \n\t"\
+ "movq %%mm3, 3*17*8(%1) \n\t"\
+ "add $8, %1 \n\t"\
+ "add %3, %0 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
+ : "r" ((x86_reg)srcStride)\
+ : "memory"\
+ );\
+ \
+ temp_ptr= temp;\
+ count=4;\
+ \
+/*FIXME reorder for speed */\
+ __asm__ volatile(\
+ /*"pxor %%mm7, %%mm7 \n\t"*/\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "movq 16(%0), %%mm2 \n\t"\
+ "movq 24(%0), %%mm3 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t" \
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
+ \
+ "add $136, %0 \n\t"\
+ "add %6, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ \
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
+ : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
+ :"memory"\
+ );\
+}\
+\
+static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ uint64_t temp[9*2];\
+ uint64_t *temp_ptr= temp;\
+ int count= 9;\
+\
+ /*FIXME unroll */\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq (%0), %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "movq %%mm0, (%1) \n\t"\
+ "movq %%mm1, 9*8(%1) \n\t"\
+ "add $8, %1 \n\t"\
+ "add %3, %0 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
+ : "r" ((x86_reg)srcStride)\
+ : "memory"\
+ );\
+ \
+ temp_ptr= temp;\
+ count=2;\
+ \
+/*FIXME reorder for speed */\
+ __asm__ volatile(\
+ /*"pxor %%mm7, %%mm7 \n\t"*/\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm1 \n\t"\
+ "movq 16(%0), %%mm2 \n\t"\
+ "movq 24(%0), %%mm3 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
+ \
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
+ "add %4, %1 \n\t"\
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
+ \
+ "add $72, %0 \n\t"\
+ "add %6, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ \
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
+ : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
+}\
+\
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[8];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half) + 64;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
+}\
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[8 + 9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[9];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
+}\
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
+}\
+\
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t temp[32];\
+ uint8_t * const half= (uint8_t*)temp;\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[16*2 + 17*2];\
+ uint8_t * const halfH= ((uint8_t*)half) + 256;\
+ uint8_t * const halfHV= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
+}\
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}\
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ uint64_t half[17*2];\
+ uint8_t * const halfH= ((uint8_t*)half);\
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
+}
+
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
+QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
+QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
+QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
+QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
+QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
+
+/***********************************/
+/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
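+/* The mcXY suffix encodes the quarter-pel position: X horizontal and Y
+ * vertical, in quarter-pel units (cf. mc10 -> h_lowpass, mc01 -> v_lowpass
+ * above). Instead of the full 8-tap filter, each position is approximated
+ * with half-pel averaging (_x2_/_y2_/_xy2_) or a three-point blend (the
+ * _l3_ helpers). */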
+
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
+}
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
+}
+
+#define QPEL_2TAP(OPNAME, SIZE, MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
+ OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
+ OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
+ OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
+
+QPEL_2TAP(put_, 16, mmx2)
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_, 8, mmx2)
+QPEL_2TAP(avg_, 8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow)
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_, 8, 3dnow)
+QPEL_2TAP(avg_, 8, 3dnow)
+
+
+#if 0
+static void just_return() { return; }
+#endif
+
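+/* Global motion compensation with a per-pixel affine vector field.
+ * For each output pixel the fractional position (dx,dy) selects a
+ * bilinear blend of the four surrounding source pixels:
+ *   dst = (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy)
+ *        + src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + r) >> (2*shift)
+ * with s = 1<<shift. Blocks whose parameters do not fit this fast path
+ * fall back to ff_gmc_c(). */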
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
+ const int w = 8;
+ const int ix = ox>>(16+shift);
+ const int iy = oy>>(16+shift);
+ const int oxs = ox>>4;
+ const int oys = oy>>4;
+ const int dxxs = dxx>>4;
+ const int dxys = dxy>>4;
+ const int dyxs = dyx>>4;
+ const int dyys = dyy>>4;
+ const uint16_t r4[4] = {r,r,r,r};
+ const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
+ const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
+ const uint64_t shift2 = 2*shift;
+ uint8_t edge_buf[(h+1)*stride];
+ int x, y;
+
+ const int dxw = (dxx-(1<<(16+shift)))*(w-1);
+ const int dyh = (dyy-(1<<(16+shift)))*(h-1);
+ const int dxh = dxy*(h-1);
+ const int dyw = dyx*(w-1);
+ if( // non-constant fullpel offset (3% of blocks)
+ ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
+ (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
+ // uses more than 16 bits of subpel mv (only at huge resolution)
+ || (dxx|dxy|dyx|dyy)&15 )
+ {
+ //FIXME could still use mmx for some of the rows
+ ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
+ return;
+ }
+
+ src += ix + iy*stride;
+ if( (unsigned)ix >= width-w ||
+ (unsigned)iy >= height-h )
+ {
+ ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
+ src = edge_buf;
+ }
+
+ __asm__ volatile(
+ "movd %0, %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ "punpcklwd %%mm6, %%mm6 \n\t"
+ :: "r"(1<<shift)
+ );
+
+ for(x=0; x<w; x+=4){
+ uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
+ oxs - dxys + dxxs*(x+1),
+ oxs - dxys + dxxs*(x+2),
+ oxs - dxys + dxxs*(x+3) };
+ uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
+ oys - dyys + dyxs*(x+1),
+ oys - dyys + dyxs*(x+2),
+ oys - dyys + dyxs*(x+3) };
+
+ for(y=0; y<h; y++){
+ __asm__ volatile(
+ "movq %0, %%mm4 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "paddw %2, %%mm4 \n\t"
+ "paddw %3, %%mm5 \n\t"
+ "movq %%mm4, %0 \n\t"
+ "movq %%mm5, %1 \n\t"
+ "psrlw $12, %%mm4 \n\t"
+ "psrlw $12, %%mm5 \n\t"
+ : "+m"(*dx4), "+m"(*dy4)
+ : "m"(*dxy4), "m"(*dyy4)
+ );
+
+ __asm__ volatile(
+ "movq %%mm6, %%mm2 \n\t"
+ "movq %%mm6, %%mm1 \n\t"
+ "psubw %%mm4, %%mm2 \n\t"
+ "psubw %%mm5, %%mm1 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm4, %%mm3 \n\t"
+ "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
+ "pmullw %%mm5, %%mm3 \n\t" // dx*dy
+ "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
+ "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
+
+ "movd %4, %%mm5 \n\t"
+ "movd %3, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
+ "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
+
+ "movd %2, %%mm5 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
+ "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
+ "paddw %5, %%mm1 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+
+ "psrlw %6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+
+ : "=m"(dst[x+y*stride])
+ : "m"(src[0]), "m"(src[1]),
+ "m"(src[stride]), "m"(src[stride+1]),
+ "m"(*r4), "m"(shift2)
+ );
+ src += stride;
+ }
+ src += 4-h*stride;
+ }
+}
+
+#define PREFETCH(name, op) \
+static void name(void *mem, int stride, int h){\
+ const uint8_t *p= mem;\
+ do{\
+ __asm__ volatile(#op" %0" :: "m"(*p));\
+ p+= stride;\
+ }while(--h);\
+}
+PREFETCH(prefetch_mmx2, prefetcht0)
+PREFETCH(prefetch_3dnow, prefetch)
+#undef PREFETCH
+
+#include "h264dsp_mmx.c"
+
+/* CAVS specific */
+void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
+void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx);
+
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ avg_pixels8_mmx(dst, src, stride, 8);
+}
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ put_pixels16_mmx(dst, src, stride, 16);
+}
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
+ avg_pixels16_mmx(dst, src, stride, 16);
+}
+
+/* VC1 specific */
+void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);
+
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
+ put_pixels8_mmx(dst, src, stride, 8);
+}
+
+/* external functions, from idct_mmx.c */
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+/* XXX: these functions should be removed as soon as all IDCTs are
+   converted */
+#ifdef CONFIG_GPL
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmx_idct (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmx_idct (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmxext_idct (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_mmxext_idct (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
+static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+
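+/* Vorbis square-polar inverse coupling. A scalar sketch of the mapping
+ * the SIMD versions below implement (illustrative only, kept out of the
+ * build; not the dsputil C version verbatim):
+ */
+#if 0
+static void vorbis_inverse_coupling_scalar(float *mag, float *ang, int blocksize)
+{
+    int i;
+    for (i = 0; i < blocksize; i++) {
+        float m = mag[i], a = ang[i];
+        if (m > 0) {
+            if (a > 0) { ang[i] = m - a; }
+            else       { ang[i] = m; mag[i] = m + a; }
+        } else {
+            if (a > 0) { ang[i] = m + a; }
+            else       { ang[i] = m; mag[i] = m - a; }
+        }
+    }
+}
+#endif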
+static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+{
+ int i;
+ __asm__ volatile("pxor %%mm7, %%mm7":);
+ for(i=0; i<blocksize; i+=2) {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
+ "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
+ "pslld $31, %%mm2 \n\t" // keep only the sign bit
+ "pxor %%mm2, %%mm1 \n\t"
+ "movq %%mm3, %%mm4 \n\t"
+ "pand %%mm1, %%mm3 \n\t"
+ "pandn %%mm1, %%mm4 \n\t"
+ "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+ __asm__ volatile("femms");
+}
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+ int i;
+
+ __asm__ volatile(
+ "movaps %0, %%xmm5 \n\t"
+ ::"m"(ff_pdw_80000000[0])
+ );
+ for(i=0; i<blocksize; i+=4) {
+ __asm__ volatile(
+ "movaps %0, %%xmm0 \n\t"
+ "movaps %1, %%xmm1 \n\t"
+ "xorps %%xmm2, %%xmm2 \n\t"
+ "xorps %%xmm3, %%xmm3 \n\t"
+ "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+ "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+ "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
+ "xorps %%xmm2, %%xmm1 \n\t"
+ "movaps %%xmm3, %%xmm4 \n\t"
+ "andps %%xmm1, %%xmm3 \n\t"
+ "andnps %%xmm1, %%xmm4 \n\t"
+ "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+ "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+ "movaps %%xmm3, %1 \n\t"
+ "movaps %%xmm0, %0 \n\t"
+ :"+m"(mag[i]), "+m"(ang[i])
+ ::"memory"
+ );
+ }
+}
+
+#define IF1(x) x
+#define IF0(x)
+
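+/* AC-3 downmix. MIX5 is the fast path for 5-channel input going to mono
+ * or stereo when the matrix has the usual symmetric layout (checked in
+ * ac3_downmix_sse() by comparing coefficient bit patterns as integers);
+ * the IF0/IF1 wrappers compile the stereo-only statements in or out.
+ * MIX_MISC handles arbitrary matrices. Channels are stored planar,
+ * 256 floats (0x400 bytes) apart. */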
+#define MIX5(mono,stereo)\
+ __asm__ volatile(\
+ "movss 0(%2), %%xmm5 \n"\
+ "movss 8(%2), %%xmm6 \n"\
+ "movss 24(%2), %%xmm7 \n"\
+ "shufps $0, %%xmm5, %%xmm5 \n"\
+ "shufps $0, %%xmm6, %%xmm6 \n"\
+ "shufps $0, %%xmm7, %%xmm7 \n"\
+ "1: \n"\
+ "movaps (%0,%1), %%xmm0 \n"\
+ "movaps 0x400(%0,%1), %%xmm1 \n"\
+ "movaps 0x800(%0,%1), %%xmm2 \n"\
+ "movaps 0xc00(%0,%1), %%xmm3 \n"\
+ "movaps 0x1000(%0,%1), %%xmm4 \n"\
+ "mulps %%xmm5, %%xmm0 \n"\
+ "mulps %%xmm6, %%xmm1 \n"\
+ "mulps %%xmm5, %%xmm2 \n"\
+ "mulps %%xmm7, %%xmm3 \n"\
+ "mulps %%xmm7, %%xmm4 \n"\
+ stereo("addps %%xmm1, %%xmm0 \n")\
+ "addps %%xmm1, %%xmm2 \n"\
+ "addps %%xmm3, %%xmm0 \n"\
+ "addps %%xmm4, %%xmm2 \n"\
+ mono("addps %%xmm2, %%xmm0 \n")\
+ "movaps %%xmm0, (%0,%1) \n"\
+ stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(i)\
+ :"r"(samples[0]+len), "r"(matrix)\
+ :"memory"\
+ );
+
+#define MIX_MISC(stereo)\
+ __asm__ volatile(\
+ "1: \n"\
+ "movaps (%3,%0), %%xmm0 \n"\
+ stereo("movaps %%xmm0, %%xmm1 \n")\
+ "mulps %%xmm6, %%xmm0 \n"\
+ stereo("mulps %%xmm7, %%xmm1 \n")\
+ "lea 1024(%3,%0), %1 \n"\
+ "mov %5, %2 \n"\
+ "2: \n"\
+ "movaps (%1), %%xmm2 \n"\
+ stereo("movaps %%xmm2, %%xmm3 \n")\
+ "mulps (%4,%2), %%xmm2 \n"\
+ stereo("mulps 16(%4,%2), %%xmm3 \n")\
+ "addps %%xmm2, %%xmm0 \n"\
+ stereo("addps %%xmm3, %%xmm1 \n")\
+ "add $1024, %1 \n"\
+ "add $32, %2 \n"\
+ "jl 2b \n"\
+ "movaps %%xmm0, (%3,%0) \n"\
+ stereo("movaps %%xmm1, 1024(%3,%0) \n")\
+ "add $16, %0 \n"\
+ "jl 1b \n"\
+ :"+&r"(i), "=&r"(j), "=&r"(k)\
+ :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
+ :"memory"\
+ );
+
+static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
+{
+ int (*matrix_cmp)[2] = (int(*)[2])matrix;
+ intptr_t i,j,k;
+
+ i = -len*sizeof(float);
+ if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
+ MIX5(IF0,IF1);
+ } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
+ MIX5(IF1,IF0);
+ } else {
+ DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]);
+ j = 2*in_ch*sizeof(float);
+ __asm__ volatile(
+ "1: \n"
+ "sub $8, %0 \n"
+ "movss (%2,%0), %%xmm6 \n"
+ "movss 4(%2,%0), %%xmm7 \n"
+ "shufps $0, %%xmm6, %%xmm6 \n"
+ "shufps $0, %%xmm7, %%xmm7 \n"
+ "movaps %%xmm6, (%1,%0,4) \n"
+ "movaps %%xmm7, 16(%1,%0,4) \n"
+ "jg 1b \n"
+ :"+&r"(j)
+ :"r"(matrix_simd), "r"(matrix)
+ :"memory"
+ );
+ if(out_ch == 2) {
+ MIX_MISC(IF1);
+ } else {
+ MIX_MISC(IF0);
+ }
+ }
+}
+
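+/* dst[i] *= src[i] for i in [0,len); both loops walk backwards from the
+ * end of the arrays so one register serves as index and loop condition. */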
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+ x86_reg i = (len-4)*4;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%1,%0), %%mm0 \n\t"
+ "movq 8(%1,%0), %%mm1 \n\t"
+ "pfmul (%2,%0), %%mm0 \n\t"
+ "pfmul 8(%2,%0), %%mm1 \n\t"
+ "movq %%mm0, (%1,%0) \n\t"
+ "movq %%mm1, 8(%1,%0) \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ "femms \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src)
+ :"memory"
+ );
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+ x86_reg i = (len-8)*4;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%1,%0), %%xmm0 \n\t"
+ "movaps 16(%1,%0), %%xmm1 \n\t"
+ "mulps (%2,%0), %%xmm0 \n\t"
+ "mulps 16(%2,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t"
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src)
+ :"memory"
+ );
+}
+
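+/* dst[i] = src0[i] * src1[len-1-i]; pswapd and shufps $0x1b reverse the
+ * element order of the src1 loads. */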
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+ x86_reg i = len*4-16;
+ __asm__ volatile(
+ "1: \n\t"
+ "pswapd 8(%1), %%mm0 \n\t"
+ "pswapd (%1), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "movq %%mm0, (%2,%0) \n\t"
+ "movq %%mm1, 8(%2,%0) \n\t"
+ "add $16, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+ __asm__ volatile("femms");
+}
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
+ x86_reg i = len*4-32;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps 16(%1), %%xmm0 \n\t"
+ "movaps (%1), %%xmm1 \n\t"
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%2,%0) \n\t"
+ "movaps %%xmm1, 16(%2,%0) \n\t"
+ "add $32, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(src1)
+ :"r"(dst), "r"(src0)
+ );
+}
+
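+/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3. Only the common
+ * step==1 and step==2 cases with src3==0 are done in SIMD; everything
+ * else falls back to ff_vector_fmul_add_add_c(). */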
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ x86_reg i = (len-4)*4;
+ if(step == 2 && src3 == 0){
+ dst += (len-4)*2;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t"
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movd %%mm0, (%1) \n\t"
+ "movd %%mm1, 16(%1) \n\t"
+ "psrlq $32, %%mm0 \n\t"
+ "psrlq $32, %%mm1 \n\t"
+ "movd %%mm0, 8(%1) \n\t"
+ "movd %%mm1, 24(%1) \n\t"
+ "sub $32, %1 \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2,%0), %%mm0 \n\t"
+ "movq 8(%2,%0), %%mm1 \n\t"
+ "pfmul (%3,%0), %%mm0 \n\t"
+ "pfmul 8(%3,%0), %%mm1 \n\t"
+ "pfadd (%4,%0), %%mm0 \n\t"
+ "pfadd 8(%4,%0), %%mm1 \n\t"
+ "movq %%mm0, (%1,%0) \n\t"
+ "movq %%mm1, 8(%1,%0) \n\t"
+ "sub $16, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+ __asm__ volatile("femms");
+}
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+ const float *src2, int src3, int len, int step){
+ x86_reg i = (len-8)*4;
+ if(step == 2 && src3 == 0){
+ dst += (len-8)*2;
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t"
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movss %%xmm0, (%1) \n\t"
+ "movss %%xmm1, 32(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 16(%1) \n\t"
+ "movss %%xmm3, 48(%1) \n\t"
+ "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+ "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+ "movss %%xmm0, 8(%1) \n\t"
+ "movss %%xmm1, 40(%1) \n\t"
+ "movhlps %%xmm0, %%xmm2 \n\t"
+ "movhlps %%xmm1, %%xmm3 \n\t"
+ "movss %%xmm2, 24(%1) \n\t"
+ "movss %%xmm3, 56(%1) \n\t"
+ "sub $64, %1 \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i), "+r"(dst)
+ :"r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else if(step == 1 && src3 == 0){
+ __asm__ volatile(
+ "1: \n\t"
+ "movaps (%2,%0), %%xmm0 \n\t"
+ "movaps 16(%2,%0), %%xmm1 \n\t"
+ "mulps (%3,%0), %%xmm0 \n\t"
+ "mulps 16(%3,%0), %%xmm1 \n\t"
+ "addps (%4,%0), %%xmm0 \n\t"
+ "addps 16(%4,%0), %%xmm1 \n\t"
+ "movaps %%xmm0, (%1,%0) \n\t"
+ "movaps %%xmm1, 16(%1,%0) \n\t"
+ "sub $32, %0 \n\t"
+ "jge 1b \n\t"
+ :"+r"(i)
+ :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+ :"memory"
+ );
+ }
+ else
+ ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
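+/* Overlap-add windowing for the MDCT: combines the tail of one frame
+ * (src0) with the reversed head of the next (src1) under a 2*len-sample
+ * window, producing 2*len outputs that are symmetric around the centre
+ * (see ff_vector_fmul_window_c for the scalar reference). Only the
+ * add_bias==0 case is vectorized. */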
+static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+ if(add_bias == 0){
+ x86_reg i = -len*4;
+ x86_reg j = len*4-8;
+ __asm__ volatile(
+ "1: \n"
+ "pswapd (%5,%1), %%mm1 \n"
+ "movq (%5,%0), %%mm0 \n"
+ "pswapd (%4,%1), %%mm5 \n"
+ "movq (%3,%0), %%mm4 \n"
+ "movq %%mm0, %%mm2 \n"
+ "movq %%mm1, %%mm3 \n"
+ "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
+ "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
+ "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
+ "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
+ "pfadd %%mm3, %%mm2 \n"
+ "pfsub %%mm0, %%mm1 \n"
+ "pswapd %%mm2, %%mm2 \n"
+ "movq %%mm1, (%2,%0) \n"
+ "movq %%mm2, (%2,%1) \n"
+ "sub $8, %1 \n"
+ "add $8, %0 \n"
+ "jl 1b \n"
+ "femms \n"
+ :"+r"(i), "+r"(j)
+ :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+ );
+ }else
+#endif
+ ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
+static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
+ const float *win, float add_bias, int len){
+#ifdef HAVE_6REGS
+ if(add_bias == 0){
+ x86_reg i = -len*4;
+ x86_reg j = len*4-16;
+ __asm__ volatile(
+ "1: \n"
+ "movaps (%5,%1), %%xmm1 \n"
+ "movaps (%5,%0), %%xmm0 \n"
+ "movaps (%4,%1), %%xmm5 \n"
+ "movaps (%3,%0), %%xmm4 \n"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n"
+ "shufps $0x1b, %%xmm5, %%xmm5 \n"
+ "movaps %%xmm0, %%xmm2 \n"
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
+ "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
+ "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
+ "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
+ "addps %%xmm3, %%xmm2 \n"
+ "subps %%xmm0, %%xmm1 \n"
+ "shufps $0x1b, %%xmm2, %%xmm2 \n"
+ "movaps %%xmm1, (%2,%0) \n"
+ "movaps %%xmm2, (%2,%1) \n"
+ "sub $16, %1 \n"
+ "add $16, %0 \n"
+ "jl 1b \n"
+ :"+r"(i), "+r"(j)
+ :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+ );
+ }else
+#endif
+ ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
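+/* dst[i] = src[i] * mul: the scalar is splatted across a register with
+ * shufps $0, then the int32 inputs are converted and scaled in blocks
+ * (cvtpi2ps handles two values per instruction, cvtdq2ps four). */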
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
+{
+ x86_reg i = -4*len;
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n"
+ "shufps $0, %%xmm4, %%xmm4 \n"
+ "1: \n"
+ "cvtpi2ps (%2,%0), %%xmm0 \n"
+ "cvtpi2ps 8(%2,%0), %%xmm1 \n"
+ "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+ "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+ "movlhps %%xmm1, %%xmm0 \n"
+ "movlhps %%xmm3, %%xmm2 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm2 \n"
+ "movaps %%xmm0, (%1,%0) \n"
+ "movaps %%xmm2, 16(%1,%0) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul)
+ );
+}
+
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
+{
+ x86_reg i = -4*len;
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n"
+ "shufps $0, %%xmm4, %%xmm4 \n"
+ "1: \n"
+ "cvtdq2ps (%2,%0), %%xmm0 \n"
+ "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm1 \n"
+ "movaps %%xmm0, (%1,%0) \n"
+ "movaps %%xmm1, 16(%1,%0) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul)
+ );
+}
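+
+/* Both variants above compute dst[i] = src[i] * mul over len samples; the
+ * SSE version converts two int32s at a time with cvtpi2ps and merges the
+ * 64-bit halves with movlhps, while SSE2's cvtdq2ps converts four at once.
+ * Scalar sketch:
+ *
+ *     for (i = 0; i < len; i++)
+ *         dst[i] = src[i] * mul;
+ */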
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ // not bit-exact: pf2id uses different rounding than C and SSE
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "pf2id (%2,%0,2) , %%mm0 \n\t"
+ "pf2id 8(%2,%0,2) , %%mm1 \n\t"
+ "pf2id 16(%2,%0,2) , %%mm2 \n\t"
+ "pf2id 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t"
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ "femms \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
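+
+/* The float_to_int16 variants in this group all perform a saturating
+ * float -> int16 conversion; a plain C sketch (illustrative only, not the
+ * exact C fallback):
+ *
+ *     for (i = 0; i < len; i++)
+ *         dst[i] = av_clip_int16(lrintf(src[i]));
+ *
+ * pf2id truncates toward zero while cvtps2pi/cvtps2dq honor the MXCSR
+ * rounding mode, hence the bit-exactness note above. */
+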
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
+ "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
+ "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
+ "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t"
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ "emms \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t"
+ "lea (%2,%0,2) , %2 \n\t"
+ "add %0 , %1 \n\t"
+ "neg %0 \n\t"
+ "1: \n\t"
+ "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
+ "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
+ "packssdw %%xmm1 , %%xmm0 \n\t"
+ "movdqa %%xmm0 , (%1,%0) \n\t"
+ "add $16 , %0 \n\t"
+ " js 1b \n\t"
+ :"+r"(reglen), "+r"(dst), "+r"(src)
+ );
+}
+
+#ifdef HAVE_YASM
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
+void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
+static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+{
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+}
+void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
+#else
+#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#endif
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ DECLARE_ALIGNED_16(int16_t, tmp[len]);\
+ int i,j,c;\
+ for(c=0; c<channels; c++){\
+ float_to_int16_##cpu(tmp, src[c], len);\
+ for(i=0, j=c; i<len; i++, j+=channels)\
+ dst[j] = tmp[i];\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
+ if(channels==1)\
+ float_to_int16_##cpu(dst, src[0], len);\
+ else if(channels==2){\
+ x86_reg reglen = len; \
+ const float *src0 = src[0];\
+ const float *src1 = src[1];\
+ __asm__ volatile(\
+ "shl $2, %0 \n"\
+ "add %0, %1 \n"\
+ "add %0, %2 \n"\
+ "add %0, %3 \n"\
+ "neg %0 \n"\
+ body\
+ :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
+ );\
+ }else if(channels==6){\
+ ff_float_to_int16_interleave6_##cpu(dst, src, len);\
+ }else\
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
+}
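+
+/* For channels == 2 the instantiated loop bodies below are equivalent to
+ * the scalar interleave (sketch; convert() stands for the saturating
+ * float -> int16 conversion of the respective variant):
+ *
+ *     for (i = 0; i < len; i++) {
+ *         dst[2*i]     = convert(src[0][i]);
+ *         dst[2*i + 1] = convert(src[1][i]);
+ *     }
+ */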
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow,
+ "1: \n"
+ "pf2id (%2,%0), %%mm0 \n"
+ "pf2id 8(%2,%0), %%mm1 \n"
+ "pf2id (%3,%0), %%mm2 \n"
+ "pf2id 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n"
+ "packssdw %%mm3, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n"
+ "punpckhwd %%mm2, %%mm1 \n"
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n"
+ "js 1b \n"
+ "femms \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse,
+ "1: \n"
+ "cvtps2pi (%2,%0), %%mm0 \n"
+ "cvtps2pi 8(%2,%0), %%mm1 \n"
+ "cvtps2pi (%3,%0), %%mm2 \n"
+ "cvtps2pi 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n"
+ "packssdw %%mm3, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n"
+ "punpckhwd %%mm2, %%mm1 \n"
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n"
+ "js 1b \n"
+ "emms \n"
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2,
+ "1: \n"
+ "cvtps2dq (%2,%0), %%xmm0 \n"
+ "cvtps2dq (%3,%0), %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "movhlps %%xmm0, %%xmm1 \n"
+ "punpcklwd %%xmm1, %%xmm0 \n"
+ "movdqa %%xmm0, (%1,%0) \n"
+ "add $16, %0 \n"
+ "js 1b \n"
+)
+
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
+ if(channels==6)
+ ff_float_to_int16_interleave6_3dn2(dst, src, len);
+ else
+ float_to_int16_interleave_3dnow(dst, src, len, channels);
+}
+
+
+void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width);
+void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width);
+void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
+void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
+void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "movdqu 16(%1,%2), %%xmm1 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "paddw 16(%0,%2), %%xmm1 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm1, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqa 16(%0,%2), %%xmm2 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "movdqu 16(%1,%2), %%xmm3 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm2, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
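+
+/* add_int16/sub_int16 above are the elementwise v1[i] +=/-= v2[i] over
+ * 'order' samples. Note that v1 is stored with movdqa and must therefore
+ * be 16-byte aligned, while v2 is loaded with movdqu and may be
+ * unaligned. */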
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+ int res = 0;
+ DECLARE_ALIGNED_16(int64_t, sh);
+ x86_reg o = -(order << 1);
+
+ v1 += order;
+ v2 += order;
+ sh = shift;
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "movdqu (%0,%3), %%xmm0 \n\t"
+ "movdqu 16(%0,%3), %%xmm1 \n\t"
+ "pmaddwd (%1,%3), %%xmm0 \n\t"
+ "pmaddwd 16(%1,%3), %%xmm1 \n\t"
+ "paddd %%xmm0, %%xmm7 \n\t"
+ "paddd %%xmm1, %%xmm7 \n\t"
+ "add $32, %3 \n\t"
+ "js 1b \n\t"
+ "movhlps %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "psrad %4, %%xmm7 \n\t"
+ "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "movd %%xmm7, %2 \n\t"
+ : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+ : "m"(sh)
+ );
+ return res;
+}
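+
+/* Rough scalar equivalent (sketch; the asm applies the shift to partial
+ * sums rather than to each product, so results can differ once
+ * intermediates overflow the shifted range):
+ *
+ *     int64_t sum = 0;
+ *     for (i = 0; i < order; i++)
+ *         sum += v1[i] * v2[i];
+ *     return (int32_t)(sum >> shift);
+ */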
+
+void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+{
+ mm_flags = mm_support();
+
+ if (avctx->dsp_mask) {
+ if (avctx->dsp_mask & FF_MM_FORCE)
+ mm_flags |= (avctx->dsp_mask & 0xffff);
+ else
+ mm_flags &= ~(avctx->dsp_mask & 0xffff);
+ }
+
+#if 0
+ av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
+ if (mm_flags & FF_MM_MMX)
+ av_log(avctx, AV_LOG_INFO, " mmx");
+ if (mm_flags & FF_MM_MMXEXT)
+ av_log(avctx, AV_LOG_INFO, " mmxext");
+ if (mm_flags & FF_MM_3DNOW)
+ av_log(avctx, AV_LOG_INFO, " 3dnow");
+ if (mm_flags & FF_MM_SSE)
+ av_log(avctx, AV_LOG_INFO, " sse");
+ if (mm_flags & FF_MM_SSE2)
+ av_log(avctx, AV_LOG_INFO, " sse2");
+ av_log(avctx, AV_LOG_INFO, "\n");
+#endif
+
+ if (mm_flags & FF_MM_MMX) {
+ const int idct_algo= avctx->idct_algo;
+
+ if(avctx->lowres==0){
+ if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
+ c->idct_put= ff_simple_idct_put_mmx;
+ c->idct_add= ff_simple_idct_add_mmx;
+ c->idct = ff_simple_idct_mmx;
+ c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+#ifdef CONFIG_GPL
+ }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
+ if(mm_flags & FF_MM_MMXEXT){
+ c->idct_put= ff_libmpeg2mmx2_idct_put;
+ c->idct_add= ff_libmpeg2mmx2_idct_add;
+ c->idct = ff_mmxext_idct;
+ }else{
+ c->idct_put= ff_libmpeg2mmx_idct_put;
+ c->idct_add= ff_libmpeg2mmx_idct_add;
+ c->idct = ff_mmx_idct;
+ }
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+#endif
+ }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) &&
+ idct_algo==FF_IDCT_VP3){
+ if(mm_flags & FF_MM_SSE2){
+ c->idct_put= ff_vp3_idct_put_sse2;
+ c->idct_add= ff_vp3_idct_add_sse2;
+ c->idct = ff_vp3_idct_sse2;
+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
+ }else{
+ c->idct_put= ff_vp3_idct_put_mmx;
+ c->idct_add= ff_vp3_idct_add_mmx;
+ c->idct = ff_vp3_idct_mmx;
+ c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
+ }
+ }else if(idct_algo==FF_IDCT_CAVS){
+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
+ }else if(idct_algo==FF_IDCT_XVIDMMX){
+ if(mm_flags & FF_MM_SSE2){
+ c->idct_put= ff_idct_xvid_sse2_put;
+ c->idct_add= ff_idct_xvid_sse2_add;
+ c->idct = ff_idct_xvid_sse2;
+ c->idct_permutation_type= FF_SSE2_IDCT_PERM;
+ }else if(mm_flags & FF_MM_MMXEXT){
+ c->idct_put= ff_idct_xvid_mmx2_put;
+ c->idct_add= ff_idct_xvid_mmx2_add;
+ c->idct = ff_idct_xvid_mmx2;
+ }else{
+ c->idct_put= ff_idct_xvid_mmx_put;
+ c->idct_add= ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ }
+ }
+ }
+
+ c->put_pixels_clamped = put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = add_pixels_clamped_mmx;
+ c->clear_block = clear_block_mmx;
+ c->clear_blocks = clear_blocks_mmx;
+ if (mm_flags & FF_MM_SSE)
+ c->clear_block = clear_block_sse;
+
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
+
+ SET_HPEL_FUNCS(put, 0, 16, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
+ SET_HPEL_FUNCS(avg, 0, 16, mmx);
+ SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
+ SET_HPEL_FUNCS(put, 1, 8, mmx);
+ SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
+ SET_HPEL_FUNCS(avg, 1, 8, mmx);
+ SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
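+
+/* In the pixels_tab arrays, IDX selects the block width (0: 16 pixels,
+ * 1: 8 pixels) and the second index the half-pel case (0: aligned,
+ * 1: x half-pel, 2: y half-pel, 3: both), per the dsputil convention. */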
+
+ c->gmc= gmc_mmx;
+
+ c->add_bytes= add_bytes_mmx;
+ c->add_bytes_l2= add_bytes_l2_mmx;
+
+ c->draw_edges = draw_edges_mmx;
+
+ if (ENABLE_ANY_H263) {
+ c->h263_v_loop_filter= h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter= h263_h_loop_filter_mmx;
+ }
+ c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
+ c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
+ c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
+
+ c->h264_idct_dc_add=
+ c->h264_idct_add= ff_h264_idct_add_mmx;
+ c->h264_idct8_dc_add=
+ c->h264_idct8_add= ff_h264_idct8_add_mmx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+
+ if (mm_flags & FF_MM_MMXEXT) {
+ c->prefetch = prefetch_mmx2;
+
+ c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
+
+ c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
+
+ c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
+ c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+
+ if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
+ c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
+ c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
+ }
+ }
+
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
+ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
+ c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
+
+ SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
+
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
+
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
+
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
+ c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
+ c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
+ c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
+ c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
+ c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
+ c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
+ c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
+ c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+ c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
+
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+ c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+ c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+ c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+ c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+ c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
+ if (ENABLE_CAVS_DECODER)
+ ff_cavsdsp_init_mmx2(c, avctx);
+
+ if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
+ ff_vc1dsp_init_mmx(c, avctx);
+
+ c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
+ } else if (mm_flags & FF_MM_3DNOW) {
+ c->prefetch = prefetch_3dnow;
+
+ c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+
+ c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+ }
+
+ SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
+
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
+
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
+
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
+
+ if (ENABLE_CAVS_DECODER)
+ ff_cavsdsp_init_3dnow(c, avctx);
+ }
+
+
+#define H264_QPEL_FUNCS(x, y, CPU)\
+ c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
+ c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
+ c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
+ c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
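+
+/* x and y are the quarter-pel sub-positions (0..3 each); x + y*4 maps the
+ * pair onto the 16-entry qpel function table, with the first index again
+ * selecting 16- vs 8-pixel wide blocks. */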
+ if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
+ // these functions are slower than mmx on AMD, but faster on Intel
+/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma
+ c->put_pixels_tab[0][0] = put_pixels16_sse2;
+ c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
+*/
+ H264_QPEL_FUNCS(0, 0, sse2);
+ }
+ if(mm_flags & FF_MM_SSE2){
+ c->h264_idct8_add = ff_h264_idct8_add_sse2;
+ c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+
+ H264_QPEL_FUNCS(0, 1, sse2);
+ H264_QPEL_FUNCS(0, 2, sse2);
+ H264_QPEL_FUNCS(0, 3, sse2);
+ H264_QPEL_FUNCS(1, 1, sse2);
+ H264_QPEL_FUNCS(1, 2, sse2);
+ H264_QPEL_FUNCS(1, 3, sse2);
+ H264_QPEL_FUNCS(2, 1, sse2);
+ H264_QPEL_FUNCS(2, 2, sse2);
+ H264_QPEL_FUNCS(2, 3, sse2);
+ H264_QPEL_FUNCS(3, 1, sse2);
+ H264_QPEL_FUNCS(3, 2, sse2);
+ H264_QPEL_FUNCS(3, 3, sse2);
+ }
+#ifdef HAVE_SSSE3
+ if(mm_flags & FF_MM_SSSE3){
+ H264_QPEL_FUNCS(1, 0, ssse3);
+ H264_QPEL_FUNCS(1, 1, ssse3);
+ H264_QPEL_FUNCS(1, 2, ssse3);
+ H264_QPEL_FUNCS(1, 3, ssse3);
+ H264_QPEL_FUNCS(2, 0, ssse3);
+ H264_QPEL_FUNCS(2, 1, ssse3);
+ H264_QPEL_FUNCS(2, 2, ssse3);
+ H264_QPEL_FUNCS(2, 3, ssse3);
+ H264_QPEL_FUNCS(3, 0, ssse3);
+ H264_QPEL_FUNCS(3, 1, ssse3);
+ H264_QPEL_FUNCS(3, 2, ssse3);
+ H264_QPEL_FUNCS(3, 3, ssse3);
+ c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd;
+ c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
+ c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
+ c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
+ c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
+ c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
+ }
+#endif
+
+#if defined(CONFIG_GPL) && defined(HAVE_YASM)
+ if( mm_flags&FF_MM_MMXEXT ){
+#ifdef ARCH_X86_32
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+#endif
+ if( mm_flags&FF_MM_SSE2 ){
+ c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
+ c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+ }
+ }
+#endif
+
+#ifdef CONFIG_SNOW_DECODER
+    if(mm_flags & FF_MM_SSE2 & 0){ /* the "& 0" makes this branch dead, keeping the SSE2 snow path disabled */
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
+#ifdef HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
+#endif
+ c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
+ }
+ else{
+ if(mm_flags & FF_MM_MMXEXT){
+ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
+#ifdef HAVE_7REGS
+ c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
+#endif
+ }
+ c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
+ }
+#endif
+
+ if(mm_flags & FF_MM_3DNOW){
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+ c->vector_fmul = vector_fmul_3dnow;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
+ }
+ if(mm_flags & FF_MM_3DNOWEXT){
+ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+ c->vector_fmul_window = vector_fmul_window_3dnow2;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+ }
+ }
+ if(mm_flags & FF_MM_SSE){
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+ c->ac3_downmix = ac3_downmix_sse;
+ c->vector_fmul = vector_fmul_sse;
+ c->vector_fmul_reverse = vector_fmul_reverse_sse;
+ c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->vector_fmul_window = vector_fmul_window_sse;
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ }
+ if(mm_flags & FF_MM_3DNOW)
+ c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+ if(mm_flags & FF_MM_SSE2){
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+ c->add_int16 = add_int16_sse2;
+ c->sub_int16 = sub_int16_sse2;
+ c->scalarproduct_int16 = scalarproduct_int16_sse2;
+ }
+ }
+
+ if (ENABLE_ENCODERS)
+ dsputilenc_init_mmx(c, avctx);
+
+#if 0
+ // for speed testing
+ get_pixels = just_return;
+ put_pixels_clamped = just_return;
+ add_pixels_clamped = just_return;
+
+ pix_abs16x16 = just_return;
+ pix_abs16x16_x2 = just_return;
+ pix_abs16x16_y2 = just_return;
+ pix_abs16x16_xy2 = just_return;
+
+ put_pixels_tab[0] = just_return;
+ put_pixels_tab[1] = just_return;
+ put_pixels_tab[2] = just_return;
+ put_pixels_tab[3] = just_return;
+
+ put_no_rnd_pixels_tab[0] = just_return;
+ put_no_rnd_pixels_tab[1] = just_return;
+ put_no_rnd_pixels_tab[2] = just_return;
+ put_no_rnd_pixels_tab[3] = just_return;
+
+ avg_pixels_tab[0] = just_return;
+ avg_pixels_tab[1] = just_return;
+ avg_pixels_tab[2] = just_return;
+ avg_pixels_tab[3] = just_return;
+
+ avg_no_rnd_pixels_tab[0] = just_return;
+ avg_no_rnd_pixels_tab[1] = just_return;
+ avg_no_rnd_pixels_tab[2] = just_return;
+ avg_no_rnd_pixels_tab[3] = just_return;
+
+ //av_fdct = just_return;
+ //ff_idct = just_return;
+#endif
+}
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
new file mode 100644
index 0000000000..87617e3016
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -0,0 +1,154 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DSPUTIL_MMX_H
+#define AVCODEC_X86_DSPUTIL_MMX_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+typedef struct { uint64_t a, b; } xmm_reg;
+
+extern const uint64_t ff_bone;
+extern const uint64_t ff_wtwo;
+
+extern const uint64_t ff_pdw_80000000[2];
+
+extern const uint64_t ff_pw_3;
+extern const uint64_t ff_pw_4;
+extern const xmm_reg ff_pw_5;
+extern const xmm_reg ff_pw_8;
+extern const uint64_t ff_pw_15;
+extern const xmm_reg ff_pw_16;
+extern const uint64_t ff_pw_20;
+extern const xmm_reg ff_pw_28;
+extern const xmm_reg ff_pw_32;
+extern const uint64_t ff_pw_42;
+extern const uint64_t ff_pw_64;
+extern const uint64_t ff_pw_96;
+extern const uint64_t ff_pw_128;
+extern const uint64_t ff_pw_255;
+
+extern const uint64_t ff_pb_1;
+extern const uint64_t ff_pb_3;
+extern const uint64_t ff_pb_7;
+extern const uint64_t ff_pb_1F;
+extern const uint64_t ff_pb_3F;
+extern const uint64_t ff_pb_81;
+extern const uint64_t ff_pb_A1;
+extern const uint64_t ff_pb_FC;
+
+extern const double ff_pd_1[2];
+extern const double ff_pd_2[2];
+
+#define LOAD4(stride,in,a,b,c,d)\
+ "movq 0*"#stride"+"#in", "#a"\n\t"\
+ "movq 1*"#stride"+"#in", "#b"\n\t"\
+ "movq 2*"#stride"+"#in", "#c"\n\t"\
+ "movq 3*"#stride"+"#in", "#d"\n\t"
+
+#define STORE4(stride,out,a,b,c,d)\
+ "movq "#a", 0*"#stride"+"#out"\n\t"\
+ "movq "#b", 1*"#stride"+"#out"\n\t"\
+ "movq "#c", 2*"#stride"+"#out"\n\t"\
+ "movq "#d", 3*"#stride"+"#out"\n\t"
+
+/* in/out: mma=mma+mmb, mmb=mmb-mma */
+#define SUMSUB_BA( a, b ) \
+ "paddw "#b", "#a" \n\t"\
+ "paddw "#b", "#b" \n\t"\
+ "psubw "#a", "#b" \n\t"
+
+#define SBUTTERFLY(a,b,t,n,m)\
+ "mov" #m " " #a ", " #t " \n\t" /* abcd */\
+ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
+
+#define TRANSPOSE4(a,b,c,d,t)\
+ SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
+ SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
+ SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
+ SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
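+
+/* Per the SBUTTERFLY traces, the transposed rows end up in a, d, t, c
+ * (in that order); callers must read the outputs from these permuted
+ * register names. */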
+
+// e,f,g,h can be memory
+// out: a,d,t,c
+#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
+ "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
+ "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
+ "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\
+ "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
+ SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
+ /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
+ SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
+ /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
+ SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
+ /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
+ SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
+ /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
+
+#ifdef ARCH_X86_64
+// permutes 01234567 -> 05736421
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+ SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
+ SBUTTERFLY(c,d,b,wd,dqa)\
+ SBUTTERFLY(e,f,d,wd,dqa)\
+ SBUTTERFLY(g,h,f,wd,dqa)\
+ SBUTTERFLY(a,c,h,dq,dqa)\
+ SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
+ SBUTTERFLY(e,g,b,dq,dqa)\
+ SBUTTERFLY(d,f,g,dq,dqa)\
+ SBUTTERFLY(a,e,f,qdq,dqa)\
+ SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
+ SBUTTERFLY(h,b,d,qdq,dqa)\
+ SBUTTERFLY(c,g,b,qdq,dqa)\
+ "movdqa %%xmm8, "#g" \n\t"
+#else
+#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
+ "movdqa "#h", "#t" \n\t"\
+ SBUTTERFLY(a,b,h,wd,dqa)\
+ "movdqa "#h", 16"#t" \n\t"\
+ "movdqa "#t", "#h" \n\t"\
+ SBUTTERFLY(c,d,b,wd,dqa)\
+ SBUTTERFLY(e,f,d,wd,dqa)\
+ SBUTTERFLY(g,h,f,wd,dqa)\
+ SBUTTERFLY(a,c,h,dq,dqa)\
+ "movdqa "#h", "#t" \n\t"\
+ "movdqa 16"#t", "#h" \n\t"\
+ SBUTTERFLY(h,b,c,dq,dqa)\
+ SBUTTERFLY(e,g,b,dq,dqa)\
+ SBUTTERFLY(d,f,g,dq,dqa)\
+ SBUTTERFLY(a,e,f,qdq,dqa)\
+ SBUTTERFLY(h,d,e,qdq,dqa)\
+ "movdqa "#h", 16"#t" \n\t"\
+ "movdqa "#t", "#h" \n\t"\
+ SBUTTERFLY(h,b,d,qdq,dqa)\
+ SBUTTERFLY(c,g,b,qdq,dqa)\
+ "movdqa 16"#t", "#g" \n\t"
+#endif
+
+#define MOVQ_WONE(regd) \
+ __asm__ volatile ( \
+ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
+ "psrlw $15, %%" #regd ::)
+
+void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff --git a/libavcodec/x86/dsputil_mmx_avg_template.c b/libavcodec/x86/dsputil_mmx_avg_template.c
new file mode 100644
index 0000000000..616a12b44b
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_avg_template.c
@@ -0,0 +1,896 @@
+/*
+ * DSP utils: average functions are compiled twice for 3dnow/mmx2
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* XXX: we use explicit registers to work around a gcc 2.95.2 register
+   asm clobber bug; this way the code works both with 2.95.2 and with
+   -fPIC. */
+static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $4, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "movd (%2), %%mm2 \n\t"
+ "movd 4(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "movd 8(%2), %%mm2 \n\t"
+ "movd 12(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $16, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "pcmpeqb %%mm6, %%mm6 \n\t"
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $4, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 4(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movd (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 8(%2), %%mm0 \n\t"
+ PAVGB" 12(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movd %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movd %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $16, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+
+static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "add %5, %3 \n\t"
+ PAVGB" (%3), %%mm1 \n\t"
+ "movq %%mm1, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%1, %3), %%mm3 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 9(%1), %%mm2 \n\t"
+ PAVGB" 9(%1, %3), %%mm3 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm2, 8(%2) \n\t"
+ "movq %%mm3, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%1, %3), %%mm3 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 9(%1), %%mm2 \n\t"
+ PAVGB" 9(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq %%mm2, 8(%2) \n\t"
+ "movq %%mm3, 8(%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" 8(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ PAVGB" 16(%2), %%mm0 \n\t"
+ PAVGB" 24(%2), %%mm1 \n\t"
+ PAVGB" (%3), %%mm0 \n\t"
+ PAVGB" 8(%3), %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ __asm__ volatile(
+ "pcmpeqb %%mm6, %%mm6 \n\t"
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%2), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq 16(%2), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "pxor %%mm6, %%mm2 \n\t"
+ "pxor %%mm6, %%mm3 \n\t"
+ PAVGB" %%mm2, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm1 \n\t"
+ "pxor %%mm6, %%mm0 \n\t"
+ "pxor %%mm6, %%mm1 \n\t"
+ "movq %%mm0, (%3) \n\t"
+ "movq %%mm1, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+// The following generic constraints would be preferable, but gcc generates worse code with them:
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
+ :"r"(src1Stride), "r"(dstStride)
+ :"memory");*/
+}
+
+/* GL: this function rounds incorrectly on overflow */
+static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm0 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D" (block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+/* GL: this function rounds incorrectly on overflow */
+static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "psubusb %%mm6, %%mm1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D" (block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "movq (%2, %3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t"
+ PAVGB" (%1, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "movq (%2, %3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t"
+ PAVGB" (%1, %3), %%mm1 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm2 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm2 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm2 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "sub %3, %2 \n\t"
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ "movq (%2, %3), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm0 \n\t"
+ PAVGB" %%mm4, %%mm1 \n\t"
+ "movq %%mm0, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "movq (%2, %3), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ PAVGB" %%mm4, %%mm1 \n\t"
+ "movq %%mm2, (%2, %3) \n\t"
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+/* Note this is not correctly rounded, but this function is only
+ * used for B-frames so it does not matter. */
+static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BONE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ PAVGB" 1(%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "psubusb %%mm6, %%mm2 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm2, %%mm1 \n\t"
+ PAVGB" (%2), %%mm0 \n\t"
+ PAVGB" (%2, %3), %%mm1 \n\t"
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGB" 1(%1, %3), %%mm1 \n\t"
+ PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "add %%"REG_a", %1 \n\t"
+ PAVGB" %%mm1, %%mm2 \n\t"
+ PAVGB" %%mm0, %%mm1 \n\t"
+ PAVGB" (%2), %%mm2 \n\t"
+ PAVGB" (%2, %3), %%mm1 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r" ((x86_reg)line_size)
+ :"%"REG_a, "memory");
+}
+
+static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ do {
+ __asm__ volatile(
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1, %2), %%mm1 \n\t"
+ "movd (%1, %2, 2), %%mm2 \n\t"
+ "movd (%1, %3), %%mm3 \n\t"
+ PAVGB" (%0), %%mm0 \n\t"
+ PAVGB" (%0, %2), %%mm1 \n\t"
+ PAVGB" (%0, %2, 2), %%mm2 \n\t"
+ PAVGB" (%0, %3), %%mm3 \n\t"
+ "movd %%mm0, (%1) \n\t"
+ "movd %%mm1, (%1, %2) \n\t"
+ "movd %%mm2, (%1, %2, 2) \n\t"
+ "movd %%mm3, (%1, %3) \n\t"
+ ::"S"(pixels), "D"(block),
+ "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
+ :"memory");
+ block += 4*line_size;
+ pixels += 4*line_size;
+ h -= 4;
+ } while(h > 0);
+}
+
+//FIXME the following could be optimized too ...
+static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
+ DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_pixels8_y2)(block , pixels , line_size, h);
+ DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
+ DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8)(block , pixels , line_size, h);
+ DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_x2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_y2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+#define QPEL_2TAP_L3(OPNAME) \
+static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%1,%2), %%mm0 \n\t"\
+ "movq 8(%1,%2), %%mm1 \n\t"\
+ PAVGB" (%1,%3), %%mm0 \n\t"\
+ PAVGB" 8(%1,%3), %%mm1 \n\t"\
+ PAVGB" (%1), %%mm0 \n\t"\
+ PAVGB" 8(%1), %%mm1 \n\t"\
+ STORE_OP( (%1,%4),%%mm0)\
+ STORE_OP(8(%1,%4),%%mm1)\
+ "movq %%mm0, (%1,%4) \n\t"\
+ "movq %%mm1, 8(%1,%4) \n\t"\
+ "add %5, %1 \n\t"\
+ "decl %0 \n\t"\
+ "jnz 1b \n\t"\
+ :"+g"(h), "+r"(src)\
+ :"r"((x86_reg)off1), "r"((x86_reg)off2),\
+ "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
+ :"memory"\
+ );\
+}\
+static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%1,%2), %%mm0 \n\t"\
+ PAVGB" (%1,%3), %%mm0 \n\t"\
+ PAVGB" (%1), %%mm0 \n\t"\
+ STORE_OP((%1,%4),%%mm0)\
+ "movq %%mm0, (%1,%4) \n\t"\
+ "add %5, %1 \n\t"\
+ "decl %0 \n\t"\
+ "jnz 1b \n\t"\
+ :"+g"(h), "+r"(src)\
+ :"r"((x86_reg)off1), "r"((x86_reg)off2),\
+ "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
+ :"memory"\
+ );\
+}
+
+#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
+QPEL_2TAP_L3(avg_)
+#undef STORE_OP
+#define STORE_OP(a,b)
+QPEL_2TAP_L3(put_)
+#undef STORE_OP
+#undef QPEL_2TAP_L3
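+
+/* STORE_OP is the only difference between the two instantiations above:
+ * for avg_ it blends the result with the existing destination via PAVGB
+ * before the store, for put_ it expands to nothing, so one template
+ * yields both operation variants. */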
diff --git a/libavcodec/x86/dsputil_mmx_qns_template.c b/libavcodec/x86/dsputil_mmx_qns_template.c
new file mode 100644
index 0000000000..d2dbfc5a58
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_qns_template.c
@@ -0,0 +1,101 @@
+/*
+ * DSP utils: QNS functions are compiled 3 times for mmx/3dnow/ssse3
+ * Copyright (c) 2004 Michael Niedermayer
+ *
+ * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
+ * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
+
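+/* QNS helper: estimates the (scaled) sum of squared weighted errors that
+ * would remain if basis*scale were folded into rem; the encoder uses it to
+ * rate candidate basis vectors before committing one via add_8x8basis(). */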
+static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ assert(FFABS(scale) < MAX_ABS);
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+ SET_RND(mm6);
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %4, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "pmullw (%3, %0), %%mm0 \n\t"
+ "pmullw 8(%3, %0), %%mm1 \n\t"
+ "pmaddwd %%mm0, %%mm0 \n\t"
+ "pmaddwd %%mm1, %%mm1 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "psrld $4, %%mm0 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ PHADDD(%%mm7, %%mm6)
+ "psrld $2, %%mm7 \n\t"
+ "movd %%mm7, %0 \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ );
+ return i;
+}
+
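+/* Adds basis*scale into rem using the rounded 16-bit multiply from PMULHRW;
+ * when |scale| is too large for the 16-bit path, the scalar C loop below
+ * performs the same update exactly. */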
+static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ if(FFABS(scale) < MAX_ABS){
+ scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+ SET_RND(mm6);
+ __asm__ volatile(
+ "movd %3, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "movq %%mm0, (%2, %0) \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "g"(scale)
+ );
+ }else{
+ for(i=0; i<8*8; i++){
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+ }
+ }
+}
diff --git a/libavcodec/x86/dsputil_mmx_rnd_template.c b/libavcodec/x86/dsputil_mmx_rnd_template.c
new file mode 100644
index 0000000000..45ed59088e
--- /dev/null
+++ b/libavcodec/x86/dsputil_mmx_rnd_template.c
@@ -0,0 +1,590 @@
+/*
+ * DSP utils: MMX functions are compiled twice, for the rnd and no_rnd cases
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
+ * and improved by Zdenek Kabelac <kabi@users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// put_pixels
+static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "add $8, %2 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm5, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 16(%2), %%mm1 \n\t"
+ "add %4, %1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ "add $32, %2 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq %%mm5, (%3) \n\t"
+ "add %5, %3 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "movq 1(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "movq 8(%1), %%mm0 \n\t"
+ "movq 9(%1), %%mm1 \n\t"
+ "movq 8(%1, %3), %%mm2 \n\t"
+ "movq 9(%1, %3), %%mm3 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, 8(%2) \n\t"
+ "movq %%mm5, 8(%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "testl $1, %0 \n\t"
+ " jz 1f \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ "add $16, %2 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "decl %0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 8(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq 16(%2), %%mm1 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 24(%2), %%mm3 \n\t"
+ "add %4, %1 \n\t"
+ PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
+ "movq %%mm4, (%3) \n\t"
+ "movq %%mm5, 8(%3) \n\t"
+ "add %5, %3 \n\t"
+ "add $32, %2 \n\t"
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
+ :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#else
+ :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
+#endif
+ :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
+ :"memory");
+}
+
+static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"),%%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq %%mm4, (%2) \n\t"
+ "movq %%mm5, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
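+    /* 2x2 bilinear average: out = (p[x] + p[x+1] + q[x] + q[x+1] + rnd) >> 2
+       computed in 16-bit lanes; mm4/mm5 carry the previous row's horizontal
+       sums so each source row is loaded and widened only once */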
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// avg_pixels
+static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movd %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+// in case more speed is needed - unrolling would certainly help
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %0, %%mm0 \n\t"
+ "movq %1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, %0 \n\t"
+ "movq 8%0, %%mm0 \n\t"
+ "movq 8%1, %%mm1 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ "movq %%mm2, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ }
+ while (--h);
+}
+
+static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+
+static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq %2, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ :"+m"(*dst)
+ :"m"(*src1), "m"(*src2)
+ :"memory");
+ dst += dstStride;
+ src1 += src1Stride;
+ src2 += 8;
+ } while (--h);
+}
+
+static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq 1%1, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ "movq 8%1, %%mm0 \n\t"
+ "movq 9%1, %%mm1 \n\t"
+ "movq 8%0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8%0 \n\t"
+ :"+m"(*block)
+ :"m"(*pixels)
+ :"memory");
+ pixels += line_size;
+ block += line_size;
+ } while (--h);
+}
+
+static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+{
+ MOVQ_BFE(mm6);
+ JUMPALIGN();
+ do {
+ __asm__ volatile(
+ "movq %1, %%mm0 \n\t"
+ "movq %2, %%mm1 \n\t"
+ "movq %0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, %0 \n\t"
+ "movq 8%1, %%mm0 \n\t"
+ "movq 8%2, %%mm1 \n\t"
+ "movq 8%0, %%mm3 \n\t"
+ PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+ PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
+ "movq %%mm0, 8%0 \n\t"
+ :"+m"(*dst)
+ :"m"(*src1), "m"(*src2)
+ :"memory");
+ dst += dstStride;
+ src1 += src1Stride;
+ src2 += 16;
+ } while (--h);
+}
+
+static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_BFE(mm6);
+ __asm__ volatile(
+ "lea (%3, %3), %%"REG_a" \n\t"
+ "movq (%1), %%mm0 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm0, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "movq (%1, %3), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
+ "movq (%2), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
+ "movq (%2, %3), %%mm3 \n\t"
+ PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm1, (%2, %3) \n\t"
+ "add %%"REG_a", %1 \n\t"
+ "add %%"REG_a", %2 \n\t"
+
+ "subl $4, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels), "+D"(block)
+ :"r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(3)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
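+        /* mm2 = 0xFE in every byte (all-ones doubled): the low-bit mask the
+           PAVGB macro needs, since mm6 already holds the rounder here */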
+ PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
+ "movq %%mm5, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "pcmpeqd %%mm2, %%mm2 \n\t"
+ "paddb %%mm2, %%mm2 \n\t"
+ PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
+ "movq %%mm1, (%2, %%"REG_a") \n\t"
+ "add %3, %%"REG_a" \n\t"
+
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :REG_a, "memory");
+}
+
+//FIXME optimize
+static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put, pixels8_y2)(block , pixels , line_size, h);
+ DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(put, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg, pixels8_y2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+ DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
+ DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
new file mode 100644
index 0000000000..91165f2fb7
--- /dev/null
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -0,0 +1,92 @@
+;******************************************************************************
+;* MMX optimized DSP utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+section .text align=16
+
+%macro PSWAPD_SSE 2
+ pshufw %1, %2, 0x4e
+%endmacro
+%macro PSWAPD_3DN1 2
+ movq %1, %2
+ psrlq %1, 32
+ punpckldq %1, %2
+%endmacro
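+; pswapd swaps the two dwords of an MMX register: pshufw with 0x4e selects
+; the words in order 2,3,0,1, and the 3DNow! fallback builds the same result
+; from a shift plus an unpack.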
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
+; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+ %define lend r10d
+ mov lend, r2d
+%else
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize]
+ mov src2q, [srcq+2*gprsize]
+ mov src3q, [srcq+3*gprsize]
+ mov src4q, [srcq+4*gprsize]
+ mov src5q, [srcq+5*gprsize]
+ mov srcq, [srcq]
+ sub src1q, srcq
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+ cvtps2pi mm0, [srcq]
+ cvtps2pi mm1, [srcq+src1q]
+ cvtps2pi mm2, [srcq+src2q]
+ cvtps2pi mm3, [srcq+src3q]
+ cvtps2pi mm4, [srcq+src4q]
+ cvtps2pi mm5, [srcq+src5q]
+ packssdw mm0, mm3
+ packssdw mm1, mm4
+ packssdw mm2, mm5
+ pswapd mm3, mm0
+ punpcklwd mm0, mm1
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm3
+ pswapd mm3, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm2, mm1
+ punpckldq mm1, mm3
+ movq [dstq ], mm0
+ movq [dstq+16], mm2
+ movq [dstq+ 8], mm1
+ add srcq, 8
+ add dstq, 24
+ sub lend, 2
+ jg .loop
+ emms
+ RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+%define pswapd PSWAPD_SSE
+FLOAT_TO_INT16_INTERLEAVE6 sse
+%define cvtps2pi pf2id
+%define pswapd PSWAPD_3DN1
+FLOAT_TO_INT16_INTERLEAVE6 3dnow
+%undef pswapd
+FLOAT_TO_INT16_INTERLEAVE6 3dn2
+%undef cvtps2pi
+
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
new file mode 100644
index 0000000000..76b367822b
--- /dev/null
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -0,0 +1,1441 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "dsputil_mmx.h"
+
+
+static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
+{
+ __asm__ volatile(
+ "mov $-128, %%"REG_a" \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq (%0, %2), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "movq %%mm0, (%1, %%"REG_a") \n\t"
+ "movq %%mm1, 8(%1, %%"REG_a") \n\t"
+ "movq %%mm2, 16(%1, %%"REG_a") \n\t"
+ "movq %%mm3, 24(%1, %%"REG_a") \n\t"
+ "add %3, %0 \n\t"
+ "add $32, %%"REG_a" \n\t"
+ "js 1b \n\t"
+ : "+r" (pixels)
+ : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
+ : "%"REG_a
+ );
+}
+
+static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
+{
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "movq (%0, %2), %%xmm1 \n\t"
+ "movq (%0, %2,2), %%xmm2 \n\t"
+ "movq (%0, %3), %%xmm3 \n\t"
+ "lea (%0,%2,4), %0 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "movdqa %%xmm0, (%1) \n\t"
+ "movdqa %%xmm1, 16(%1) \n\t"
+ "movdqa %%xmm2, 32(%1) \n\t"
+ "movdqa %%xmm3, 48(%1) \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "movq (%0, %2), %%xmm1 \n\t"
+ "movq (%0, %2,2), %%xmm2 \n\t"
+ "movq (%0, %3), %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "movdqa %%xmm0, 64(%1) \n\t"
+ "movdqa %%xmm1, 80(%1) \n\t"
+ "movdqa %%xmm2, 96(%1) \n\t"
+ "movdqa %%xmm3, 112(%1) \n\t"
+ : "+r" (pixels)
+ : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
+ );
+}
+
+static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
+{
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov $-128, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "movq %%mm1, 8(%2, %%"REG_a") \n\t"
+ "add %3, %0 \n\t"
+ "add %3, %1 \n\t"
+ "add $16, %%"REG_a" \n\t"
+ "jnz 1b \n\t"
+ : "+r" (s1), "+r" (s2)
+ : "r" (block+64), "r" ((x86_reg)stride)
+ : "%"REG_a
+ );
+}
+
+static int pix_sum16_mmx(uint8_t * pix, int line_size){
+ const int h=16;
+ int sum;
+ x86_reg index= -line_size*h;
+
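+    /* the loop runs on a negative byte index: %2 points one whole block past
+       pix, so (%2, %1) addresses the current row and "js" exits at zero */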
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "pxor %%mm6, %%mm6 \n\t"
+ "1: \n\t"
+ "movq (%2, %1), %%mm0 \n\t"
+ "movq (%2, %1), %%mm1 \n\t"
+ "movq 8(%2, %1), %%mm2 \n\t"
+ "movq 8(%2, %1), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "paddw %%mm2, %%mm3 \n\t"
+ "paddw %%mm1, %%mm3 \n\t"
+ "paddw %%mm3, %%mm6 \n\t"
+ "add %3, %1 \n\t"
+ " js 1b \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "psrlq $32, %%mm6 \n\t"
+ "paddw %%mm5, %%mm6 \n\t"
+ "movq %%mm6, %%mm5 \n\t"
+ "psrlq $16, %%mm6 \n\t"
+ "paddw %%mm5, %%mm6 \n\t"
+ "movd %%mm6, %0 \n\t"
+ "andl $0xFFFF, %0 \n\t"
+ : "=&r" (sum), "+r" (index)
+ : "r" (pix - index), "r" ((x86_reg)line_size)
+ );
+
+ return sum;
+}
+
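+/* sum of squares of a 16x16 block: bytes are widened to words, squared and
+   pair-summed with pmaddwd, then the two dword halves of mm7 are folded */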
+static int pix_norm1_mmx(uint8_t *pix, int line_size) {
+ int tmp;
+ __asm__ volatile (
+ "movl $16,%%ecx\n"
+ "pxor %%mm0,%%mm0\n"
+ "pxor %%mm7,%%mm7\n"
+ "1:\n"
+ "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
+ "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
+
+ "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
+
+ "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
+ "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
+
+ "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
+ "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
+ "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
+
+ "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
+ "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
+
+ "pmaddwd %%mm3,%%mm3\n"
+ "pmaddwd %%mm4,%%mm4\n"
+
+ "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
+ pix2^2+pix3^2+pix6^2+pix7^2) */
+ "paddd %%mm3,%%mm4\n"
+ "paddd %%mm2,%%mm7\n"
+
+ "add %2, %0\n"
+ "paddd %%mm4,%%mm7\n"
+ "dec %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm7,%%mm1\n"
+ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
+ "paddd %%mm7,%%mm1\n"
+ "movd %%mm1,%1\n"
+ : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
+ return tmp;
+}
+
+static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "shr $1,%%ecx\n"
+ "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
+ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
+ "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
+ "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
+ "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
+
+        /* compute |mm1-mm2| and |mm3-mm4|: subtract each operand from the
+           other with unsigned saturation (one result is always zero), then
+           OR the two results to obtain the absolute difference */
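+        /* a rough scalar equivalent of the trick, for illustration only:
+         *   d1 = a > b ? a - b : 0;   d2 = b > a ? b - a : 0;
+         *   abs_diff = d1 | d2;       // one of the two terms is always 0
+         */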
+ "movq %%mm1,%%mm5\n"
+ "movq %%mm3,%%mm6\n"
+ "psubusb %%mm2,%%mm1\n"
+ "psubusb %%mm4,%%mm3\n"
+ "psubusb %%mm5,%%mm2\n"
+ "psubusb %%mm6,%%mm4\n"
+
+ "por %%mm1,%%mm2\n"
+ "por %%mm3,%%mm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movq %%mm2,%%mm1\n"
+ "movq %%mm4,%%mm3\n"
+
+ "punpckhbw %%mm0,%%mm2\n"
+ "punpckhbw %%mm0,%%mm4\n"
+      "punpcklbw %%mm0,%%mm1\n" /* row 0 |diff| now spread over (mm1,mm2) */
+      "punpcklbw %%mm0,%%mm3\n" /* row 1 |diff| now spread over (mm3,mm4) */
+
+ "pmaddwd %%mm2,%%mm2\n"
+ "pmaddwd %%mm4,%%mm4\n"
+ "pmaddwd %%mm1,%%mm1\n"
+ "pmaddwd %%mm3,%%mm3\n"
+
+ "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
+ "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
+
+ "paddd %%mm2,%%mm1\n"
+ "paddd %%mm4,%%mm3\n"
+ "paddd %%mm1,%%mm7\n"
+ "paddd %%mm3,%%mm7\n"
+
+ "decl %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm7,%%mm1\n"
+ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
+ "paddd %%mm7,%%mm1\n"
+ "movd %%mm1,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+
+static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
+ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
+ "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
+ "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
+ "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
+
+      /* same saturated-subtract-and-OR absolute-difference trick as in
+         sse8_mmx above */
+ "movq %%mm1,%%mm5\n"
+ "movq %%mm3,%%mm6\n"
+ "psubusb %%mm2,%%mm1\n"
+ "psubusb %%mm4,%%mm3\n"
+ "psubusb %%mm5,%%mm2\n"
+ "psubusb %%mm6,%%mm4\n"
+
+ "por %%mm1,%%mm2\n"
+ "por %%mm3,%%mm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movq %%mm2,%%mm1\n"
+ "movq %%mm4,%%mm3\n"
+
+ "punpckhbw %%mm0,%%mm2\n"
+ "punpckhbw %%mm0,%%mm4\n"
+      "punpcklbw %%mm0,%%mm1\n" /* |diff| of bytes 0-7 now spread over (mm1,mm2) */
+      "punpcklbw %%mm0,%%mm3\n" /* |diff| of bytes 8-15 now spread over (mm3,mm4) */
+
+ "pmaddwd %%mm2,%%mm2\n"
+ "pmaddwd %%mm4,%%mm4\n"
+ "pmaddwd %%mm1,%%mm1\n"
+ "pmaddwd %%mm3,%%mm3\n"
+
+ "add %3,%0\n"
+ "add %3,%1\n"
+
+ "paddd %%mm2,%%mm1\n"
+ "paddd %%mm4,%%mm3\n"
+ "paddd %%mm1,%%mm7\n"
+ "paddd %%mm3,%%mm7\n"
+
+ "decl %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm7,%%mm1\n"
+ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
+ "paddd %%mm7,%%mm1\n"
+ "movd %%mm1,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+
+static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "shr $1,%2\n"
+ "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
+ "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
+ "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
+ "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
+ "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
+
+      /* same saturated-subtract-and-OR absolute-difference trick as in
+         sse8_mmx above, applied to full 16-byte rows */
+ "movdqa %%xmm1,%%xmm5\n"
+ "movdqa %%xmm3,%%xmm6\n"
+ "psubusb %%xmm2,%%xmm1\n"
+ "psubusb %%xmm4,%%xmm3\n"
+ "psubusb %%xmm5,%%xmm2\n"
+ "psubusb %%xmm6,%%xmm4\n"
+
+ "por %%xmm1,%%xmm2\n"
+ "por %%xmm3,%%xmm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movdqa %%xmm2,%%xmm1\n"
+ "movdqa %%xmm4,%%xmm3\n"
+
+ "punpckhbw %%xmm0,%%xmm2\n"
+ "punpckhbw %%xmm0,%%xmm4\n"
+      "punpcklbw %%xmm0,%%xmm1\n" /* row 0 |diff| now spread over (xmm1,xmm2) */
+      "punpcklbw %%xmm0,%%xmm3\n" /* row 1 |diff| now spread over (xmm3,xmm4) */
+
+ "pmaddwd %%xmm2,%%xmm2\n"
+ "pmaddwd %%xmm4,%%xmm4\n"
+ "pmaddwd %%xmm1,%%xmm1\n"
+ "pmaddwd %%xmm3,%%xmm3\n"
+
+ "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
+ "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
+
+ "paddd %%xmm2,%%xmm1\n"
+ "paddd %%xmm4,%%xmm3\n"
+ "paddd %%xmm1,%%xmm7\n"
+ "paddd %%xmm3,%%xmm7\n"
+
+ "decl %2\n"
+ "jnz 1b\n"
+
+ "movdqa %%xmm7,%%xmm1\n"
+ "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
+ "paddd %%xmm1,%%xmm7\n"
+ "movdqa %%xmm7,%%xmm1\n"
+ "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
+ "paddd %%xmm1,%%xmm7\n"
+ "movd %%xmm7,%3\n"
+ : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
+ : "r" ((x86_reg)line_size));
+ return tmp;
+}
+
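+/* High-frequency noise estimate: roughly the sum over the block of the
+   absolute differences between the horizontal gradients of adjacent rows,
+   a second-derivative measure consumed by the nsse comparators below. */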
+static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
+ int tmp;
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm7,%%mm7\n"
+ "pxor %%mm6,%%mm6\n"
+
+ "movq (%0),%%mm0\n"
+ "movq %%mm0, %%mm1\n"
+ "psllq $8, %%mm0\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm0\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq %%mm4, %%mm1\n"
+ "psllq $8, %%mm4\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm4\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "1:\n"
+
+ "movq (%0),%%mm0\n"
+ "movq %%mm0, %%mm1\n"
+ "psllq $8, %%mm0\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm0\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+ "psubw %%mm0, %%mm4\n"
+ "psubw %%mm2, %%mm5\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm4, %%mm3\n\t"
+ "pcmpgtw %%mm5, %%mm1\n\t"
+ "pxor %%mm3, %%mm4\n"
+ "pxor %%mm1, %%mm5\n"
+ "psubw %%mm3, %%mm4\n"
+ "psubw %%mm1, %%mm5\n"
+ "paddw %%mm4, %%mm5\n"
+ "paddw %%mm5, %%mm6\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq %%mm4, %%mm1\n"
+ "psllq $8, %%mm4\n"
+ "psrlq $8, %%mm1\n"
+ "psrlq $8, %%mm4\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "subl $2, %%ecx\n"
+ " jnz 1b\n"
+
+ "movq %%mm6, %%mm0\n"
+ "punpcklwd %%mm7,%%mm0\n"
+ "punpckhwd %%mm7,%%mm6\n"
+ "paddd %%mm0, %%mm6\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddd %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix1), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "g" (h-2)
+ : "%ecx");
+ return tmp;
+}
+
+static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
+ int tmp;
+ uint8_t * pix= pix1;
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm7,%%mm7\n"
+ "pxor %%mm6,%%mm6\n"
+
+ "movq (%0),%%mm0\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "1:\n"
+
+ "movq (%0),%%mm0\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm0, %%mm2\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm0\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm2\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm0\n"
+ "psubw %%mm3, %%mm2\n"
+ "psubw %%mm0, %%mm4\n"
+ "psubw %%mm2, %%mm5\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm4, %%mm3\n\t"
+ "pcmpgtw %%mm5, %%mm1\n\t"
+ "pxor %%mm3, %%mm4\n"
+ "pxor %%mm1, %%mm5\n"
+ "psubw %%mm3, %%mm4\n"
+ "psubw %%mm1, %%mm5\n"
+ "paddw %%mm4, %%mm5\n"
+ "paddw %%mm5, %%mm6\n"
+
+ "add %2,%0\n"
+
+ "movq (%0),%%mm4\n"
+ "movq 1(%0),%%mm1\n"
+ "movq %%mm4, %%mm5\n"
+ "movq %%mm1, %%mm3\n"
+ "punpcklbw %%mm7,%%mm4\n"
+ "punpcklbw %%mm7,%%mm1\n"
+ "punpckhbw %%mm7,%%mm5\n"
+ "punpckhbw %%mm7,%%mm3\n"
+ "psubw %%mm1, %%mm4\n"
+ "psubw %%mm3, %%mm5\n"
+ "psubw %%mm4, %%mm0\n"
+ "psubw %%mm5, %%mm2\n"
+ "pxor %%mm3, %%mm3\n"
+ "pxor %%mm1, %%mm1\n"
+ "pcmpgtw %%mm0, %%mm3\n\t"
+ "pcmpgtw %%mm2, %%mm1\n\t"
+ "pxor %%mm3, %%mm0\n"
+ "pxor %%mm1, %%mm2\n"
+ "psubw %%mm3, %%mm0\n"
+ "psubw %%mm1, %%mm2\n"
+ "paddw %%mm0, %%mm2\n"
+ "paddw %%mm2, %%mm6\n"
+
+ "add %2,%0\n"
+ "subl $2, %%ecx\n"
+ " jnz 1b\n"
+
+ "movq %%mm6, %%mm0\n"
+ "punpcklwd %%mm7,%%mm0\n"
+ "punpckhwd %%mm7,%%mm6\n"
+ "paddd %%mm0, %%mm6\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddd %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix1), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "g" (h-2)
+ : "%ecx");
+ return tmp + hf_noise8_mmx(pix+8, line_size, h);
+}
+
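+/* Noise-preserving SSE: plain SSE plus nsse_weight times the absolute change
+   in the high-frequency noise estimate between the two blocks. */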
+static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
+ int score1, score2;
+
+ if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
+ else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
+ score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
+
+ if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+ else return score1 + FFABS(score2)*8;
+}
+
+static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
+ int score1= sse8_mmx(c, pix1, pix2, line_size, h);
+ int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
+
+ if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+ else return score1 + FFABS(score2)*8;
+}
+
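+/* vsad_intra: sum of |pix[y][x] - pix[y+1][x]| over the block, i.e. the
+   vertical activity of the source alone (the reference argument is unused). */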
+static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), %%mm2\n"\
+ "movq 8(%0), %%mm3\n"\
+ "add %2,%0\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "add %2,%0\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0xFFFF;
+}
+#undef SUM
+
+static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), " #out0 "\n"\
+ "movq 8(%0), " #out1 "\n"\
+ "add %2,%0\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ __asm__ volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "add %2,%0\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0),%%mm2\n"\
+ "movq (%1)," #out0 "\n"\
+ "movq 8(%0),%%mm3\n"\
+ "movq 8(%1)," #out1 "\n"\
+ "add %3,%0\n"\
+ "add %3,%1\n"\
+ "psubb " #out0 ", %%mm2\n"\
+ "psubb " #out1 ", %%mm3\n"\
+ "pxor %%mm7, %%mm2\n"\
+ "pxor %%mm7, %%mm3\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "add %3,%0\n"
+ "add %3,%1\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0x7FFF;
+}
+#undef SUM
+
+static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0)," #out0 "\n"\
+ "movq (%1),%%mm2\n"\
+ "movq 8(%0)," #out1 "\n"\
+ "movq 8(%1),%%mm3\n"\
+ "add %3,%0\n"\
+ "add %3,%1\n"\
+ "psubb %%mm2, " #out0 "\n"\
+ "psubb %%mm3, " #out1 "\n"\
+ "pxor %%mm7, " #out0 "\n"\
+ "pxor %%mm7, " #out1 "\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ __asm__ volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "add %3,%0\n"
+ "add %3,%1\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ "jmp 2f\n"
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+ "2:\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" ((x86_reg)line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
+ x86_reg i=0;
+ __asm__ volatile(
+ "1: \n\t"
+ "movq (%2, %0), %%mm0 \n\t"
+ "movq (%1, %0), %%mm1 \n\t"
+ "psubb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, (%3, %0) \n\t"
+ "movq 8(%2, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ "psubb %%mm0, %%mm1 \n\t"
+ "movq %%mm1, 8(%3, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (i)
+ : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
+ );
+ for(; i<w; i++)
+ dst[i+0] = src1[i+0]-src2[i+0];
+}
+
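+/* HuffYUV median prediction: dst = src2 - median(L, T, L+T-LT). The median
+   of three is formed branchlessly as max(min(L,T), min(L+T-LT, max(L,T))),
+   matching the mid_pred() used for the scalar first column below. */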
+static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
+ x86_reg i=0;
+ uint8_t l, lt;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "movq -1(%1, %0), %%mm0 \n\t" // LT
+ "movq (%1, %0), %%mm1 \n\t" // T
+ "movq -1(%2, %0), %%mm2 \n\t" // L
+ "movq (%2, %0), %%mm3 \n\t" // X
+ "movq %%mm2, %%mm4 \n\t" // L
+ "psubb %%mm0, %%mm2 \n\t"
+ "paddb %%mm1, %%mm2 \n\t" // L + T - LT
+ "movq %%mm4, %%mm5 \n\t" // L
+ "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
+ "pminub %%mm5, %%mm1 \n\t" // min(T, L)
+ "pminub %%mm2, %%mm4 \n\t"
+ "pmaxub %%mm1, %%mm4 \n\t"
+ "psubb %%mm4, %%mm3 \n\t" // dst - pred
+ "movq %%mm3, (%3, %0) \n\t"
+ "add $8, %0 \n\t"
+ "cmp %4, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (i)
+ : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
+ );
+
+ l= *left;
+ lt= *left_top;
+
+ dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
+
+ *left_top= src1[w-1];
+ *left = src2[w-1];
+}
+
+#define DIFF_PIXELS_1(m,a,t,p1,p2)\
+ "mov"#m" "#p1", "#a" \n\t"\
+ "mov"#m" "#p2", "#t" \n\t"\
+ "punpcklbw "#a", "#t" \n\t"\
+ "punpcklbw "#a", "#a" \n\t"\
+ "psubw "#t", "#a" \n\t"\
+
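+/* DIFF_PIXELS_1 trick: after the unpacks each word of "a" holds p1 in both
+   bytes (p1*257) while each word of "t" holds p1 in the high byte and p2 in
+   the low byte, so the word subtract leaves exactly p1 - p2 and no zero
+   register is needed. */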
+#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
+ uint8_t *p1b=p1, *p2b=p2;\
+ __asm__ volatile(\
+ DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
+ DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
+ DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
+ "add %4, %1 \n\t"\
+ "add %4, %2 \n\t"\
+ DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
+ DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
+ DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
+ DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
+ "mov"#m1" "#mm"0, %0 \n\t"\
+ DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
+ "mov"#m1" %0, "#mm"0 \n\t"\
+ : "+m"(temp), "+r"(p1b), "+r"(p2b)\
+ : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
+ );\
+}
+ //the "+m"(temp) is needed because gcc 2.95 sometimes fails to compile "=m"(temp)
+
+#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
+#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
+
+#define LBUTTERFLY2(a1,b1,a2,b2)\
+ "paddw " #b1 ", " #a1 " \n\t"\
+ "paddw " #b2 ", " #a2 " \n\t"\
+ "paddw " #b1 ", " #b1 " \n\t"\
+ "paddw " #b2 ", " #b2 " \n\t"\
+ "psubw " #a1 ", " #b1 " \n\t"\
+ "psubw " #a2 ", " #b2 " \n\t"
+
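+/* butterfly without a temporary: a += b; b += b; b -= a leaves a1 = a+b and
+   b1 = b-a (of the original values), the add/sub pair each Hadamard stage
+   needs */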
+#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
+ LBUTTERFLY2(m0, m1, m2, m3)\
+ LBUTTERFLY2(m4, m5, m6, m7)\
+ LBUTTERFLY2(m0, m2, m1, m3)\
+ LBUTTERFLY2(m4, m6, m5, m7)\
+ LBUTTERFLY2(m0, m4, m1, m5)\
+ LBUTTERFLY2(m2, m6, m3, m7)\
+
+#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
+
+#define MMABS_MMX(a,z)\
+ "pxor " #z ", " #z " \n\t"\
+ "pcmpgtw " #a ", " #z " \n\t"\
+ "pxor " #z ", " #a " \n\t"\
+ "psubw " #z ", " #a " \n\t"
+
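+/* branchless abs (MMX version above): pcmpgtw builds z = (a < 0) ? -1 : 0,
+   then (a ^ z) - z is the classic two's-complement sign-mask trick; the
+   MMX2 variant below instead computes max(a, -a) with pmaxsw */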
+#define MMABS_MMX2(a,z)\
+ "pxor " #z ", " #z " \n\t"\
+ "psubw " #a ", " #z " \n\t"\
+ "pmaxsw " #z ", " #a " \n\t"
+
+#define MMABS_SSSE3(a,z)\
+ "pabsw " #a ", " #a " \n\t"
+
+#define MMABS_SUM(a,z, sum)\
+ MMABS(a,z)\
+ "paddusw " #a ", " #sum " \n\t"
+
+#define MMABS_SUM_8x8_NOSPILL\
+ MMABS(%%xmm0, %%xmm8)\
+ MMABS(%%xmm1, %%xmm9)\
+ MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
+ MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
+ MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
+ MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
+ MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
+ MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
+ "paddusw %%xmm1, %%xmm0 \n\t"
+
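+/* x86_64 has 16 XMM registers, so both abs temporaries fit without spilling;
+   on 32-bit x86, xmm7 is parked in memory instead */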
+#ifdef ARCH_X86_64
+#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
+#else
+#define MMABS_SUM_8x8_SSE2\
+ "movdqa %%xmm7, (%1) \n\t"\
+ MMABS(%%xmm0, %%xmm7)\
+ MMABS(%%xmm1, %%xmm7)\
+ MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
+ MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
+ MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
+ MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
+ MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
+ "movdqa (%1), %%xmm2 \n\t"\
+ MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
+ "paddusw %%xmm1, %%xmm0 \n\t"
+#endif
+
+/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can reach
+ * about 100k on extreme inputs. That is very unlikely to occur in natural video,
+ * and even then some alternative mv/mode will almost always have a lower cost. */
+#define HSUM_MMX(a, t, dst)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $16, "#a" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define HSUM_MMX2(a, t, dst)\
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshufw $0x01, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define HSUM_SSE2(a, t, dst)\
+ "movhlps "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshuflw $0x0E, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "pshuflw $0x01, "#a", "#t" \n\t"\
+ "paddusw "#t", "#a" \n\t"\
+ "movd "#a", "#dst" \n\t"\
+
+#define HADAMARD8_DIFF_MMX(cpu) \
+static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
+ DECLARE_ALIGNED_8(uint64_t, temp[13]);\
+ int sum;\
+\
+ assert(h==8);\
+\
+ DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
+\
+ __asm__ volatile(\
+ HADAMARD48\
+\
+ "movq %%mm7, 96(%1) \n\t"\
+\
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+ STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+ "movq 96(%1), %%mm7 \n\t"\
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+ STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
+\
+ : "=r" (sum)\
+ : "r"(temp)\
+ );\
+\
+ DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
+\
+ __asm__ volatile(\
+ HADAMARD48\
+\
+ "movq %%mm7, 96(%1) \n\t"\
+\
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+ STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+ "movq 96(%1), %%mm7 \n\t"\
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+ "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
+ "movq %%mm6, %%mm7 \n\t"\
+ "movq %%mm0, %%mm6 \n\t"\
+\
+ LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
+\
+ HADAMARD48\
+ "movq %%mm7, 64(%1) \n\t"\
+ MMABS(%%mm0, %%mm7)\
+ MMABS(%%mm1, %%mm7)\
+ MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+ MMABS_SUM(%%mm3, %%mm7, %%mm1)\
+ MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+ MMABS_SUM(%%mm5, %%mm7, %%mm1)\
+ MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+ "movq 64(%1), %%mm2 \n\t"\
+ MMABS_SUM(%%mm2, %%mm7, %%mm1)\
+ "paddusw %%mm1, %%mm0 \n\t"\
+ "movq %%mm0, 64(%1) \n\t"\
+\
+ LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
+ LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+ HADAMARD48\
+ "movq %%mm7, (%1) \n\t"\
+ MMABS(%%mm0, %%mm7)\
+ MMABS(%%mm1, %%mm7)\
+ MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+ MMABS_SUM(%%mm3, %%mm7, %%mm1)\
+ MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+ MMABS_SUM(%%mm5, %%mm7, %%mm1)\
+ MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+ "movq (%1), %%mm2 \n\t"\
+ MMABS_SUM(%%mm2, %%mm7, %%mm1)\
+ "paddusw 64(%1), %%mm0 \n\t"\
+ "paddusw %%mm1, %%mm0 \n\t"\
+\
+ HSUM(%%mm0, %%mm1, %0)\
+\
+ : "=r" (sum)\
+ : "r"(temp)\
+ );\
+ return sum&0xFFFF;\
+}\
+WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
+
+#define HADAMARD8_DIFF_SSE2(cpu) \
+static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
+ DECLARE_ALIGNED_16(uint64_t, temp[4]);\
+ int sum;\
+\
+ assert(h==8);\
+\
+ DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
+\
+ __asm__ volatile(\
+ HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
+ TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
+ HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
+ MMABS_SUM_8x8\
+ HSUM_SSE2(%%xmm0, %%xmm1, %0)\
+ : "=r" (sum)\
+ : "r"(temp)\
+ );\
+ return sum&0xFFFF;\
+}\
+WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
+
+#define MMABS(a,z) MMABS_MMX(a,z)
+#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx)
+#undef MMABS
+#undef HSUM
+
+#define MMABS(a,z) MMABS_MMX2(a,z)
+#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
+#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx2)
+HADAMARD8_DIFF_SSE2(sse2)
+#undef MMABS
+#undef MMABS_SUM_8x8
+#undef HSUM
+
+#ifdef HAVE_SSSE3
+#define MMABS(a,z) MMABS_SSSE3(a,z)
+#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
+HADAMARD8_DIFF_SSE2(ssse3)
+#undef MMABS
+#undef MMABS_SUM_8x8
+#endif
+
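+/* sum_abs_dctelem: saturating sum of |block[i]| over all 64 coefficients;
+   the 0/8/64/72 byte offsets below make the four DCT_SAD4 calls tile the
+   whole 128-byte block for the MMX version */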
+#define DCT_SAD4(m,mm,o)\
+ "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
+ "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
+ "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
+ "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
+ MMABS_SUM(mm##2, mm##6, mm##0)\
+ MMABS_SUM(mm##3, mm##7, mm##1)\
+ MMABS_SUM(mm##4, mm##6, mm##0)\
+ MMABS_SUM(mm##5, mm##7, mm##1)\
+
+#define DCT_SAD_MMX\
+ "pxor %%mm0, %%mm0 \n\t"\
+ "pxor %%mm1, %%mm1 \n\t"\
+ DCT_SAD4(q, %%mm, 0)\
+ DCT_SAD4(q, %%mm, 8)\
+ DCT_SAD4(q, %%mm, 64)\
+ DCT_SAD4(q, %%mm, 72)\
+ "paddusw %%mm1, %%mm0 \n\t"\
+ HSUM(%%mm0, %%mm1, %0)
+
+#define DCT_SAD_SSE2\
+ "pxor %%xmm0, %%xmm0 \n\t"\
+ "pxor %%xmm1, %%xmm1 \n\t"\
+ DCT_SAD4(dqa, %%xmm, 0)\
+ DCT_SAD4(dqa, %%xmm, 64)\
+ "paddusw %%xmm1, %%xmm0 \n\t"\
+ HSUM(%%xmm0, %%xmm1, %0)
+
+#define DCT_SAD_FUNC(cpu) \
+static int sum_abs_dctelem_##cpu(DCTELEM *block){\
+ int sum;\
+ __asm__ volatile(\
+ DCT_SAD\
+ :"=r"(sum)\
+ :"r"(block)\
+ );\
+ return sum&0xFFFF;\
+}
+
+#define DCT_SAD DCT_SAD_MMX
+#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
+#define MMABS(a,z) MMABS_MMX(a,z)
+DCT_SAD_FUNC(mmx)
+#undef MMABS
+#undef HSUM
+
+#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
+#define MMABS(a,z) MMABS_MMX2(a,z)
+DCT_SAD_FUNC(mmx2)
+#undef HSUM
+#undef DCT_SAD
+
+#define DCT_SAD DCT_SAD_SSE2
+#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
+DCT_SAD_FUNC(sse2)
+#undef MMABS
+
+#ifdef HAVE_SSSE3
+#define MMABS(a,z) MMABS_SSSE3(a,z)
+DCT_SAD_FUNC(ssse3)
+#undef MMABS
+#endif
+#undef HSUM
+#undef DCT_SAD
+
+static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
+ int sum;
+ x86_reg i=size;
+ __asm__ volatile(
+ "pxor %%mm4, %%mm4 \n"
+ "1: \n"
+ "sub $8, %0 \n"
+ "movq (%2,%0), %%mm2 \n"
+ "movq (%3,%0,2), %%mm0 \n"
+ "movq 8(%3,%0,2), %%mm1 \n"
+ "punpckhbw %%mm2, %%mm3 \n"
+ "punpcklbw %%mm2, %%mm2 \n"
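+        /* the int8 samples land in the high byte of each word; the psraw
+           below sign-extends them and discards the stale low bytes, so mm3
+           never needs to be initialized */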
+ "psraw $8, %%mm3 \n"
+ "psraw $8, %%mm2 \n"
+ "psubw %%mm3, %%mm1 \n"
+ "psubw %%mm2, %%mm0 \n"
+ "pmaddwd %%mm1, %%mm1 \n"
+ "pmaddwd %%mm0, %%mm0 \n"
+ "paddd %%mm1, %%mm4 \n"
+ "paddd %%mm0, %%mm4 \n"
+ "jg 1b \n"
+ "movq %%mm4, %%mm3 \n"
+ "psrlq $32, %%mm3 \n"
+ "paddd %%mm3, %%mm4 \n"
+ "movd %%mm4, %1 \n"
+ :"+r"(i), "=r"(sum)
+ :"r"(pix1), "r"(pix2)
+ );
+ return sum;
+}
+
+#define PHADDD(a, t)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddd "#t", "#a" \n\t"
+/*
+ pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
+ pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
+ pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
+ */
+#define PMULHRW(x, y, s, o)\
+ "pmulhw " #s ", "#x " \n\t"\
+ "pmulhw " #s ", "#y " \n\t"\
+ "paddw " #o ", "#x " \n\t"\
+ "paddw " #o ", "#y " \n\t"\
+ "psraw $1, "#x " \n\t"\
+ "psraw $1, "#y " \n\t"
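+/* MMX has no pmulhrw, so the macro above approximates it with pmulhw, a +1
+   and an arithmetic shift; the extra halving is compensated by SCALE_OFFSET
+   (1 here, versus 0 for the true pmulhrw on the 3DNow! path below). */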
+#define DEF(x) x ## _mmx
+#define SET_RND MOVQ_WONE
+#define SCALE_OFFSET 1
+
+#include "dsputil_mmx_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#define DEF(x) x ## _3dnow
+#define SET_RND(x)
+#define SCALE_OFFSET 0
+#define PMULHRW(x, y, s, o)\
+ "pmulhrw " #s ", "#x " \n\t"\
+ "pmulhrw " #s ", "#y " \n\t"
+
+#include "dsputil_mmx_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#ifdef HAVE_SSSE3
+#undef PHADDD
+#define DEF(x) x ## _ssse3
+#define SET_RND(x)
+#define SCALE_OFFSET -1
+#define PHADDD(a, t)\
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
+#define PMULHRW(x, y, s, o)\
+ "pmulhrsw " #s ", "#x " \n\t"\
+ "pmulhrsw " #s ", "#y " \n\t"
+
+#include "dsputil_mmx_qns_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+#undef PHADDD
+#endif //HAVE_SSSE3
+
+
+/* FLAC specific */
+void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+ double *autoc);
+
+
+void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
+{
+ if (mm_flags & FF_MM_MMX) {
+ const int dct_algo = avctx->dct_algo;
+ if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
+ if(mm_flags & FF_MM_SSE2){
+ c->fdct = ff_fdct_sse2;
+ }else if(mm_flags & FF_MM_MMXEXT){
+ c->fdct = ff_fdct_mmx2;
+ }else{
+ c->fdct = ff_fdct_mmx;
+ }
+ }
+
+ c->get_pixels = get_pixels_mmx;
+ c->diff_pixels = diff_pixels_mmx;
+ c->pix_sum = pix_sum16_mmx;
+
+ c->diff_bytes= diff_bytes_mmx;
+ c->sum_abs_dctelem= sum_abs_dctelem_mmx;
+
+ c->hadamard8_diff[0]= hadamard8_diff16_mmx;
+ c->hadamard8_diff[1]= hadamard8_diff_mmx;
+
+ c->pix_norm1 = pix_norm1_mmx;
+ c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
+ c->sse[1] = sse8_mmx;
+ c->vsad[4]= vsad_intra16_mmx;
+
+ c->nsse[0] = nsse16_mmx;
+ c->nsse[1] = nsse8_mmx;
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmx;
+ }
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_mmx;
+ }
+ c->add_8x8basis= add_8x8basis_mmx;
+
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+
+
+ if (mm_flags & FF_MM_MMXEXT) {
+ c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
+ c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
+ c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+ c->vsad[4]= vsad_intra16_mmx2;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmx2;
+ }
+
+ c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
+ }
+
+ if(mm_flags & FF_MM_SSE2){
+ c->get_pixels = get_pixels_sse2;
+ c->sum_abs_dctelem= sum_abs_dctelem_sse2;
+ c->hadamard8_diff[0]= hadamard8_diff16_sse2;
+ c->hadamard8_diff[1]= hadamard8_diff_sse2;
+ if (ENABLE_FLAC_ENCODER)
+ c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
+ }
+
+#ifdef HAVE_SSSE3
+ if(mm_flags & FF_MM_SSSE3){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_ssse3;
+ }
+ c->add_8x8basis= add_8x8basis_ssse3;
+ c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
+ c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
+ c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+ }
+#endif
+
+ if(mm_flags & FF_MM_3DNOW){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_3dnow;
+ }
+ c->add_8x8basis= add_8x8basis_3dnow;
+ }
+ }
+
+ dsputil_init_pix_mmx(c, avctx);
+}
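
For reference, the sum_abs_dctelem_* kernels generated above reduce to the following scalar C. This is a sketch, not part of the patch: the function name is hypothetical, and the saturating paddusw accumulation of the SIMD versions is only approximated by the final 16-bit mask.

    #include <stdint.h>
    #include <stdlib.h>

    typedef int16_t DCTELEM;

    /* Sum of |coefficient| over one 8x8 block, as DCT_SAD computes with
     * MMABS_SUM; the mask mirrors the "sum&0xFFFF" in DCT_SAD_FUNC. */
    static int sum_abs_dctelem_c(DCTELEM *block)
    {
        int i, sum = 0;
        for (i = 0; i < 64; i++)
            sum += abs(block[i]);
        return sum & 0xFFFF;
    }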
diff --git a/libavcodec/x86/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c
new file mode 100644
index 0000000000..c87c1a78f3
--- /dev/null
+++ b/libavcodec/x86/fdct_mmx.c
@@ -0,0 +1,580 @@
+/*
+ * MMX optimized forward DCT
+ * The gcc port is Copyright (c) 2001 Fabrice Bellard.
+ * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
+ *
+ * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
+ *
+ * Intel Application Note AP-922 - fast, precise implementation of DCT
+ * http://developer.intel.com/vtune/cbts/appnotes.htm
+ *
+ * Also of inspiration:
+ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
+ * Skal's fdct at http://skal.planet-d.net/coding/dct.html
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/dsputil.h"
+
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
+
+//////////////////////////////////////////////////////////////////////
+//
+// constants for the forward DCT
+// -----------------------------
+//
+// Be sure to check that your compiler is aligning all constants to QWORD
+// (8-byte) memory boundaries! Otherwise the unaligned memory access will
+// severely stall MMX execution.
+//
+//////////////////////////////////////////////////////////////////////
+
+#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
+#define SHIFT_FRW_COL BITS_FRW_ACC
+#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
+#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
+//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+//concatenated table, for forward DCT transformation
+static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = {
+ X8(13036), // tg * (2<<16) + 0.5
+ X8(27146), // tg * (2<<16) + 0.5
+ X8(-21746) // tg * (2<<16) + 0.5
+};
+
+static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = {
+ X8(23170) //cos * (2<<15) + 0.5
+};
+
+static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) };
+
+static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+
+static struct
+{
+ const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
+} fdct_r_row_sse2 ATTR_ALIGN(16)=
+{{
+ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
+}};
+//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+
+static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
+ 16384, 16384, 22725, 19266,
+ 16384, 16384, 12873, 4520,
+ 21407, 8867, 19266, -4520,
+ -8867, -21407, -22725, -12873,
+ 16384, -16384, 12873, -22725,
+ -16384, 16384, 4520, 19266,
+ 8867, -21407, 4520, -12873,
+ 21407, -8867, 19266, -22725,
+
+ 22725, 22725, 31521, 26722,
+ 22725, 22725, 17855, 6270,
+ 29692, 12299, 26722, -6270,
+ -12299, -29692, -31521, -17855,
+ 22725, -22725, 17855, -31521,
+ -22725, 22725, 6270, 26722,
+ 12299, -29692, 6270, -17855,
+ 29692, -12299, 26722, -31521,
+
+ 21407, 21407, 29692, 25172,
+ 21407, 21407, 16819, 5906,
+ 27969, 11585, 25172, -5906,
+ -11585, -27969, -29692, -16819,
+ 21407, -21407, 16819, -29692,
+ -21407, 21407, 5906, 25172,
+ 11585, -27969, 5906, -16819,
+ 27969, -11585, 25172, -29692,
+
+ 19266, 19266, 26722, 22654,
+ 19266, 19266, 15137, 5315,
+ 25172, 10426, 22654, -5315,
+ -10426, -25172, -26722, -15137,
+ 19266, -19266, 15137, -26722,
+ -19266, 19266, 5315, 22654,
+ 10426, -25172, 5315, -15137,
+ 25172, -10426, 22654, -26722,
+
+ 16384, 16384, 22725, 19266,
+ 16384, 16384, 12873, 4520,
+ 21407, 8867, 19266, -4520,
+ -8867, -21407, -22725, -12873,
+ 16384, -16384, 12873, -22725,
+ -16384, 16384, 4520, 19266,
+ 8867, -21407, 4520, -12873,
+ 21407, -8867, 19266, -22725,
+
+ 19266, 19266, 26722, 22654,
+ 19266, 19266, 15137, 5315,
+ 25172, 10426, 22654, -5315,
+ -10426, -25172, -26722, -15137,
+ 19266, -19266, 15137, -26722,
+ -19266, 19266, 5315, 22654,
+ 10426, -25172, 5315, -15137,
+ 25172, -10426, 22654, -26722,
+
+ 21407, 21407, 29692, 25172,
+ 21407, 21407, 16819, 5906,
+ 27969, 11585, 25172, -5906,
+ -11585, -27969, -29692, -16819,
+ 21407, -21407, 16819, -29692,
+ -21407, 21407, 5906, 25172,
+ 11585, -27969, 5906, -16819,
+ 27969, -11585, 25172, -29692,
+
+ 22725, 22725, 31521, 26722,
+ 22725, 22725, 17855, 6270,
+ 29692, 12299, 26722, -6270,
+ -12299, -29692, -31521, -17855,
+ 22725, -22725, 17855, -31521,
+ -22725, 22725, 6270, 26722,
+ 12299, -29692, 6270, -17855,
+ 29692, -12299, 26722, -31521,
+};
+
+static struct
+{
+ const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
+} tab_frw_01234567_sse2 ATTR_ALIGN(16) =
+{{
+//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
+#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
+ C4, C4, C5, C7, C2, C6, C3, -C7, \
+ -C4, C4, C7, C3, C6, -C2, C7, -C5, \
+ C4, -C4, C5, -C1, C2, -C6, C3, -C1,
+// c1..c7 * cos(pi/4) * 2^15
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+}};
+
+#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
+
+#define FDCT_COL(cpu, mm, mov)\
+static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
+{\
+ __asm__ volatile (\
+ #mov" 16(%0), %%"#mm"0 \n\t" \
+ #mov" 96(%0), %%"#mm"1 \n\t" \
+ #mov" %%"#mm"0, %%"#mm"2 \n\t" \
+ #mov" 32(%0), %%"#mm"3 \n\t" \
+ "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
+ #mov" 80(%0), %%"#mm"4 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
+ #mov" (%0), %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
+ "paddsw 112(%0), %%"#mm"5 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
+ #mov" %%"#mm"0, %%"#mm"6 \n\t" \
+ "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
+ #mov" 16(%1), %%"#mm"1 \n\t" \
+ "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
+ #mov" 48(%0), %%"#mm"7 \n\t" \
+ "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
+ "paddsw 64(%0), %%"#mm"7 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
+ #mov" %%"#mm"5, %%"#mm"4 \n\t" \
+ "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
+ "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
+ "por (%2), %%"#mm"1 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
+ "pmulhw 16(%1), %%"#mm"5 \n\t" \
+ #mov" %%"#mm"4, %%"#mm"7 \n\t" \
+ "psubsw 80(%0), %%"#mm"3 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
+ #mov" %%"#mm"1, 32(%3) \n\t" \
+ "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
+ #mov" 48(%0), %%"#mm"1 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
+ "psubsw 64(%0), %%"#mm"1 \n\t" \
+ #mov" %%"#mm"2, %%"#mm"6 \n\t" \
+ #mov" %%"#mm"4, 64(%3) \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
+ "pmulhw (%4), %%"#mm"2 \n\t" \
+ "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
+ "pmulhw (%4), %%"#mm"6 \n\t" \
+ "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
+ "por (%2), %%"#mm"5 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
+ "por (%2), %%"#mm"2 \n\t" \
+ #mov" %%"#mm"1, %%"#mm"4 \n\t" \
+ #mov" (%0), %%"#mm"3 \n\t" \
+ "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
+ "psubsw 112(%0), %%"#mm"3 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
+ #mov" (%1), %%"#mm"0 \n\t" \
+ "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
+ #mov" 32(%1), %%"#mm"6 \n\t" \
+ "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
+ #mov" %%"#mm"7, (%3) \n\t" \
+ "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
+ #mov" %%"#mm"5, 96(%3) \n\t" \
+ #mov" %%"#mm"3, %%"#mm"7 \n\t" \
+ #mov" 32(%1), %%"#mm"5 \n\t" \
+ "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
+ "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
+ "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
+ "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
+ "pmulhw (%1), %%"#mm"3 \n\t" \
+ "por (%2), %%"#mm"0 \n\t" \
+ "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
+ "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
+ #mov" %%"#mm"0, 16(%3) \n\t" \
+ "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
+ #mov" %%"#mm"7, 48(%3) \n\t" \
+ "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
+ #mov" %%"#mm"5, 80(%3) \n\t" \
+ #mov" %%"#mm"3, 112(%3) \n\t" \
+ : \
+ : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
+ "r" (out + offset), "r" (ocos_4_16)); \
+}
+
+FDCT_COL(mmx, mm, movq)
+FDCT_COL(sse2, xmm, movdqa)
+
+static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
+{
+ __asm__ volatile(
+#define FDCT_ROW_SSE2_H1(i,t) \
+ "movq " #i "(%0), %%xmm2 \n\t" \
+ "movq " #i "+8(%0), %%xmm0 \n\t" \
+ "movdqa " #t "+32(%1), %%xmm3 \n\t" \
+ "movdqa " #t "+48(%1), %%xmm7 \n\t" \
+ "movdqa " #t "(%1), %%xmm4 \n\t" \
+ "movdqa " #t "+16(%1), %%xmm5 \n\t"
+
+#define FDCT_ROW_SSE2_H2(i,t) \
+ "movq " #i "(%0), %%xmm2 \n\t" \
+ "movq " #i "+8(%0), %%xmm0 \n\t" \
+ "movdqa " #t "+32(%1), %%xmm3 \n\t" \
+ "movdqa " #t "+48(%1), %%xmm7 \n\t"
+
+#define FDCT_ROW_SSE2(i) \
+ "movq %%xmm2, %%xmm1 \n\t" \
+ "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "psubsw %%xmm0, %%xmm2 \n\t" \
+ "punpckldq %%xmm2, %%xmm1 \n\t" \
+ "pshufd $78, %%xmm1, %%xmm2 \n\t" \
+ "pmaddwd %%xmm2, %%xmm3 \n\t" \
+ "pmaddwd %%xmm1, %%xmm7 \n\t" \
+ "pmaddwd %%xmm5, %%xmm2 \n\t" \
+ "pmaddwd %%xmm4, %%xmm1 \n\t" \
+ "paddd %%xmm7, %%xmm3 \n\t" \
+ "paddd %%xmm2, %%xmm1 \n\t" \
+ "paddd %%xmm6, %%xmm3 \n\t" \
+ "paddd %%xmm6, %%xmm1 \n\t" \
+ "psrad %3, %%xmm3 \n\t" \
+ "psrad %3, %%xmm1 \n\t" \
+ "packssdw %%xmm3, %%xmm1 \n\t" \
+ "movdqa %%xmm1, " #i "(%4) \n\t"
+
+ "movdqa (%2), %%xmm6 \n\t"
+ FDCT_ROW_SSE2_H1(0,0)
+ FDCT_ROW_SSE2(0)
+ FDCT_ROW_SSE2_H2(64,0)
+ FDCT_ROW_SSE2(64)
+
+ FDCT_ROW_SSE2_H1(16,64)
+ FDCT_ROW_SSE2(16)
+ FDCT_ROW_SSE2_H2(112,64)
+ FDCT_ROW_SSE2(112)
+
+ FDCT_ROW_SSE2_H1(32,128)
+ FDCT_ROW_SSE2(32)
+ FDCT_ROW_SSE2_H2(96,128)
+ FDCT_ROW_SSE2(96)
+
+ FDCT_ROW_SSE2_H1(48,192)
+ FDCT_ROW_SSE2(48)
+ FDCT_ROW_SSE2_H2(80,192)
+ FDCT_ROW_SSE2(80)
+ :
+ : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+ );
+}
+
+static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
+{
+ __asm__ volatile (
+ "pshufw $0x1B, 8(%0), %%mm5 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "psubsw %%mm5, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "punpckldq %%mm1, %%mm0 \n\t"
+ "punpckhdq %%mm1, %%mm2 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm5 \n\t"
+ "movq 32(%1), %%mm6 \n\t"
+ "movq 40(%1), %%mm7 \n\t"
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm0, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "pmaddwd %%mm0, %%mm6 \n\t"
+ "pmaddwd %%mm2, %%mm7 \n\t"
+ "pmaddwd 48(%1), %%mm0 \n\t"
+ "pmaddwd 56(%1), %%mm2 \n\t"
+ "paddd %%mm1, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "paddd %%mm6, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm0, %%mm5 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packssdw %%mm2, %%mm7 \n\t"
+ "movq %%mm3, (%3) \n\t"
+ "movq %%mm7, 8(%3) \n\t"
+ :
+ : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
+{
+ //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
+ __asm__ volatile(
+ "movd 12(%0), %%mm1 \n\t"
+ "punpcklwd 8(%0), %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psrlq $0x20, %%mm1 \n\t"
+ "movq 0(%0), %%mm0 \n\t"
+ "punpcklwd %%mm2, %%mm1 \n\t"
+ "movq %%mm0, %%mm5 \n\t"
+ "paddsw %%mm1, %%mm0 \n\t"
+ "psubsw %%mm1, %%mm5 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "punpckldq %%mm5, %%mm0 \n\t"
+ "punpckhdq %%mm5, %%mm2 \n\t"
+ "movq 0(%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm5 \n\t"
+ "movq 32(%1), %%mm6 \n\t"
+ "movq 40(%1), %%mm7 \n\t"
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm0, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "pmaddwd %%mm0, %%mm6 \n\t"
+ "pmaddwd %%mm2, %%mm7 \n\t"
+ "pmaddwd 48(%1), %%mm0 \n\t"
+ "pmaddwd 56(%1), %%mm2 \n\t"
+ "paddd %%mm1, %%mm3 \n\t"
+ "paddd %%mm4, %%mm5 \n\t"
+ "paddd %%mm6, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "paddd %%mm0, %%mm3 \n\t"
+ "paddd %%mm0, %%mm5 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "paddd %%mm0, %%mm2 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
+ "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
+ "packssdw %%mm5, %%mm3 \n\t"
+ "packssdw %%mm2, %%mm7 \n\t"
+ "movq %%mm3, 0(%3) \n\t"
+ "movq %%mm7, 8(%3) \n\t"
+ :
+ : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
+}
+
+void ff_fdct_mmx(int16_t *block)
+{
+ int64_t align_tmp[16] ATTR_ALIGN(8);
+ int16_t * block1= (int16_t*)align_tmp;
+ const int16_t *table= tab_frw_01234567;
+ int i;
+
+ fdct_col_mmx(block, block1, 0);
+ fdct_col_mmx(block, block1, 4);
+
+ for(i=8;i>0;i--) {
+ fdct_row_mmx(block1, block, table);
+ block1 += 8;
+ table += 32;
+ block += 8;
+ }
+}
+
+void ff_fdct_mmx2(int16_t *block)
+{
+ int64_t align_tmp[16] ATTR_ALIGN(8);
+ int16_t *block1= (int16_t*)align_tmp;
+ const int16_t *table= tab_frw_01234567;
+ int i;
+
+ fdct_col_mmx(block, block1, 0);
+ fdct_col_mmx(block, block1, 4);
+
+ for(i=8;i>0;i--) {
+ fdct_row_mmx2(block1, block, table);
+ block1 += 8;
+ table += 32;
+ block += 8;
+ }
+}
+
+void ff_fdct_sse2(int16_t *block)
+{
+ int64_t align_tmp[16] ATTR_ALIGN(16);
+ int16_t * const block1= (int16_t*)align_tmp;
+
+ fdct_col_sse2(block, block1, 0);
+ fdct_row_sse2(block1, block);
+}
+
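
All three ff_fdct_* entry points above share the same two-stage shape: fdct_col_* transforms the eight columns (four per call with MMX registers, all eight with SSE2), then fdct_row_* finishes each row against a per-row coefficient table. As a rough scalar model of one row pass, here is a sketch; it assumes a plain row-major 8x8 table, whereas tab_frw_01234567 is interleaved for pmaddwd and the real code butterflies the inputs first.

    #include <stdint.h>

    #define SHIFT_FRW_ROW_C (3 + 17 - 3)                 /* = SHIFT_FRW_ROW */
    #define RND_FRW_ROW_C   (1 << (SHIFT_FRW_ROW_C - 1)) /* = RND_FRW_ROW   */

    /* One 8-point row pass as a rounded matrix multiply, the arithmetic
     * that the pmaddwd/paddd/psrad sequences implement. */
    static void fdct_row_c(const int16_t *in, int16_t *out, const int16_t *table)
    {
        int i, j;
        for (i = 0; i < 8; i++) {
            int sum = RND_FRW_ROW_C;
            for (j = 0; j < 8; j++)
                sum += in[j] * table[8 * i + j];
            out[i] = sum >> SHIFT_FRW_ROW_C;
        }
    }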
diff --git a/libavcodec/x86/fft_3dn.c b/libavcodec/x86/fft_3dn.c
new file mode 100644
index 0000000000..6f2e2e8353
--- /dev/null
+++ b/libavcodec/x86/fft_3dn.c
@@ -0,0 +1,23 @@
+/*
+ * FFT/MDCT transform with 3DNow! optimizations
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define EMULATE_3DNOWEXT
+#include "fft_3dn2.c"
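
This short file is the entire plain-3DNow! build: it defines EMULATE_3DNOWEXT and re-includes fft_3dn2.c, which then substitutes a three-instruction emulation for pswapd and renames its public symbols via #define. A minimal sketch of the same one-source/two-builds idiom, with all names hypothetical:

    /* impl.c: compiled once directly and once from a wrapper file,
     * mirroring how fft_3dn.c re-includes fft_3dn2.c. */
    #include <stdint.h>

    #ifdef EMULATE_EXT
    #define swap_words swap_words_basic  /* rename the public symbol */
    #endif

    uint64_t swap_words(uint64_t x)
    {
    #ifdef EMULATE_EXT
        /* multi-step fallback, cf. the movq/psrlq/punpckldq PSWAPD */
        uint64_t hi = x >> 32, lo = x << 32;
        return hi | lo;
    #else
        /* single-instruction path, cf. native pswapd */
        return (x >> 32) | (x << 32);
    #endif
    }

    /* wrapper, cf. fft_3dn.c:
     *   #define EMULATE_EXT
     *   #include "impl.c"
     */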
diff --git a/libavcodec/x86/fft_3dn2.c b/libavcodec/x86/fft_3dn2.c
new file mode 100644
index 0000000000..1f30edc99d
--- /dev/null
+++ b/libavcodec/x86/fft_3dn2.c
@@ -0,0 +1,173 @@
+/*
+ * FFT/MDCT transform with Extended 3DNow! optimizations
+ * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 };
+
+#ifdef EMULATE_3DNOWEXT
+#define PSWAPD(s,d)\
+ "movq "#s","#d"\n"\
+ "psrlq $32,"#d"\n"\
+ "punpckldq "#s","#d"\n"
+#define ff_fft_calc_3dn2 ff_fft_calc_3dn
+#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
+#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
+#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
+#define ff_imdct_half_3dn2 ff_imdct_half_3dn
+#else
+#define PSWAPD(s,d) "pswapd "#s","#d"\n"
+#endif
+
+void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
+
+void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+{
+ int n = 1<<s->nbits;
+ int i;
+ ff_fft_dispatch_interleave_3dn2(z, s->nbits);
+ __asm__ volatile("femms");
+ if(n <= 8)
+ for(i=0; i<n; i+=2)
+ FFSWAP(FFTSample, z[i].im, z[i+1].re);
+}
+
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
+{
+ x86_reg j, k;
+ long n = 1 << s->nbits;
+ long n2 = n >> 1;
+ long n4 = n >> 2;
+ long n8 = n >> 3;
+ const uint16_t *revtab = s->fft.revtab;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ const FFTSample *in1, *in2;
+ FFTComplex *z = (FFTComplex *)output;
+
+ /* pre rotation */
+ in1 = input;
+ in2 = input + n2 - 1;
+#ifdef EMULATE_3DNOWEXT
+ __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31));
+#endif
+ for(k = 0; k < n4; k++) {
+ // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
+ __asm__ volatile(
+ "movd %0, %%mm0 \n"
+ "movd %2, %%mm1 \n"
+ "punpckldq %1, %%mm0 \n"
+ "punpckldq %3, %%mm1 \n"
+ "movq %%mm0, %%mm2 \n"
+ PSWAPD( %%mm1, %%mm3 )
+ "pfmul %%mm1, %%mm0 \n"
+ "pfmul %%mm3, %%mm2 \n"
+#ifdef EMULATE_3DNOWEXT
+ "movq %%mm0, %%mm1 \n"
+ "punpckhdq %%mm2, %%mm0 \n"
+ "punpckldq %%mm2, %%mm1 \n"
+ "pxor %%mm7, %%mm0 \n"
+ "pfadd %%mm1, %%mm0 \n"
+#else
+ "pfpnacc %%mm2, %%mm0 \n"
+#endif
+ ::"m"(in2[-2*k]), "m"(in1[2*k]),
+ "m"(tcos[k]), "m"(tsin[k])
+ );
+ __asm__ volatile(
+ "movq %%mm0, %0 \n\t"
+ :"=m"(z[revtab[k]])
+ );
+ }
+
+ ff_fft_dispatch_3dn2(z, s->fft.nbits);
+
+#define CMUL(j,mm0,mm1)\
+ "movq (%2,"#j",2), %%mm6 \n"\
+ "movq 8(%2,"#j",2), "#mm0"\n"\
+ "movq %%mm6, "#mm1"\n"\
+ "movq "#mm0",%%mm7 \n"\
+ "pfmul (%3,"#j"), %%mm6 \n"\
+ "pfmul (%4,"#j"), "#mm0"\n"\
+ "pfmul (%4,"#j"), "#mm1"\n"\
+ "pfmul (%3,"#j"), %%mm7 \n"\
+ "pfsub %%mm6, "#mm0"\n"\
+ "pfadd %%mm7, "#mm1"\n"
+
+ /* post rotation */
+ j = -n2;
+ k = n2-8;
+ __asm__ volatile(
+ "1: \n"
+ CMUL(%0, %%mm0, %%mm1)
+ CMUL(%1, %%mm2, %%mm3)
+ "movd %%mm0, (%2,%0,2) \n"
+ "movd %%mm1,12(%2,%1,2) \n"
+ "movd %%mm2, (%2,%1,2) \n"
+ "movd %%mm3,12(%2,%0,2) \n"
+ "psrlq $32, %%mm0 \n"
+ "psrlq $32, %%mm1 \n"
+ "psrlq $32, %%mm2 \n"
+ "psrlq $32, %%mm3 \n"
+ "movd %%mm0, 8(%2,%0,2) \n"
+ "movd %%mm1, 4(%2,%1,2) \n"
+ "movd %%mm2, 8(%2,%1,2) \n"
+ "movd %%mm3, 4(%2,%0,2) \n"
+ "sub $8, %1 \n"
+ "add $8, %0 \n"
+ "jl 1b \n"
+ :"+r"(j), "+r"(k)
+ :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
+ :"memory"
+ );
+ __asm__ volatile("femms");
+}
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input)
+{
+ x86_reg j, k;
+ long n = 1 << s->nbits;
+ long n4 = n >> 2;
+
+ ff_imdct_half_3dn2(s, output+n4, input);
+
+ j = -n;
+ k = n-8;
+ __asm__ volatile(
+ "movq %4, %%mm7 \n"
+ "1: \n"
+ PSWAPD((%2,%1), %%mm0)
+ PSWAPD((%3,%0), %%mm1)
+ "pxor %%mm7, %%mm0 \n"
+ "movq %%mm1, (%3,%1) \n"
+ "movq %%mm0, (%2,%0) \n"
+ "sub $8, %1 \n"
+ "add $8, %0 \n"
+ "jl 1b \n"
+ :"+r"(j), "+r"(k)
+ :"r"(output+n4), "r"(output+n4*3),
+ "m"(*m1m1)
+ );
+ __asm__ volatile("femms");
+}
+
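
For orientation, the CMUL block in ff_imdct_half_3dn2 is a packed complex multiply of the FFT output by the (tcos, tsin) twiddles, with real and imaginary parts held in separate registers, two lanes at a time. Reading the pfmul/pfsub/pfadd sequence back into scalar C gives the sketch below (hypothetical names):

    typedef struct { float re, im; } cplx;

    /* Per lane, CMUL leaves im*sin - re*cos in the first output register
     * and re*sin + im*cos in the second. */
    static cplx post_rotate(cplx z, float cos_w, float sin_w)
    {
        cplx r;
        r.re = z.im * sin_w - z.re * cos_w;
        r.im = z.re * sin_w + z.im * cos_w;
        return r;
    }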
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
new file mode 100644
index 0000000000..39718677ae
--- /dev/null
+++ b/libavcodec/x86/fft_mmx.asm
@@ -0,0 +1,467 @@
+;******************************************************************************
+;* FFT transform with SSE/3DNow optimizations
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+; These functions are not individually interchangeable with the C versions.
+; While C takes arrays of FFTComplex, the SSE/3DNow! versions leave
+; intermediate results in blocks as convenient to the vector size,
+; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x for 3DNow!).
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+%define M_SQRT1_2 0.70710678118654752440
+ps_root2: times 4 dd M_SQRT1_2
+ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+ps_m1p1: dd 1<<31, 0
+
+%assign i 16
+%rep 13
+cextern ff_cos_ %+ i
+%assign i i<<1
+%endrep
+
+%ifdef ARCH_X86_64
+ %define pointer dq
+%else
+ %define pointer dd
+%endif
+
+%macro IF0 1+
+%endmacro
+%macro IF1 1+
+ %1
+%endmacro
+
+section .text align=16
+
+%macro T2_3DN 4 ; z0, z1, mem0, mem1
+ mova %1, %3
+ mova %2, %1
+ pfadd %1, %4
+ pfsub %2, %4
+%endmacro
+
+%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
+ mova %5, %3
+ pfsub %3, %4
+ pfadd %5, %4 ; {t6,t5}
+ pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
+ mova %6, %1
+ pswapd %3, %3
+ pfadd %1, %5 ; {r0,i0}
+ pfsub %6, %5 ; {r2,i2}
+ mova %4, %2
+ pfadd %2, %3 ; {r1,i1}
+ pfsub %4, %3 ; {r3,i3}
+ SWAP %3, %6
+%endmacro
+
+; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
+; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
+%macro T4_SSE 3
+ mova %3, %1
+ shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
+ shufps %3, %2, 0xce ; {r1,i1,r2,i3}
+ mova %2, %1
+ addps %1, %3 ; {t1,t2,t6,t5}
+ subps %2, %3 ; {t3,t4,t8,t7}
+ mova %3, %1
+ shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
+ shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
+ mova %2, %1
+ addps %1, %3 ; {r0,i0,r1,i1}
+ subps %2, %3 ; {r2,i2,r3,i3}
+ mova %3, %1
+ shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
+ shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
+ SWAP %2, %3
+%endmacro
+
+%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
+ mova %5, %3
+ shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
+ shufps %5, %4, 0xee ; {r5,i5,r7,i7}
+ mova %6, %3
+ subps %3, %5 ; {r5,i5,r7,i7}
+ addps %6, %5 ; {t1,t2,t3,t4}
+ mova %5, %3
+ shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
+ mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
+ mulps %5, [ps_root2 GLOBAL]
+ addps %3, %5 ; {t8,t7,ta,t9}
+ mova %5, %6
+ shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
+ shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
+ mova %3, %6
+ addps %6, %5 ; {t1,t2,t9,ta}
+ subps %3, %5 ; {t6,t5,tc,tb}
+ mova %5, %6
+ shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
+ shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
+ mova %3, %1
+ mova %4, %2
+ addps %1, %6 ; {r0,r1,r2,r3}
+ addps %2, %5 ; {i0,i1,i2,i3}
+ subps %3, %6 ; {r4,r5,r6,r7}
+ subps %4, %5 ; {i4,i5,i6,i7}
+%endmacro
+
+; scheduled for cpu-bound sizes
+%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
+IF%1 mova m4, Z(4)
+IF%1 mova m5, Z(5)
+ mova m0, %2 ; wre
+ mova m2, m4
+ mova m1, %3 ; wim
+ mova m3, m5
+ mulps m2, m0 ; r2*wre
+IF%1 mova m6, Z(6)
+ mulps m3, m1 ; i2*wim
+IF%1 mova m7, Z(7)
+ mulps m4, m1 ; r2*wim
+ mulps m5, m0 ; i2*wre
+ addps m2, m3 ; r2*wre + i2*wim
+ mova m3, m1
+ mulps m1, m6 ; r3*wim
+ subps m5, m4 ; i2*wre - r2*wim
+ mova m4, m0
+ mulps m3, m7 ; i3*wim
+ mulps m4, m6 ; r3*wre
+ mulps m0, m7 ; i3*wre
+ subps m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m1 ; i3*wre + r3*wim
+ mova m1, m4
+ addps m4, m2 ; t5
+ subps m1, m2 ; t3
+ subps m3, m4 ; r2
+ addps m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ mova m3, m5
+ subps m5, m0 ; t4
+ mova m4, m6
+ subps m6, m5 ; r3
+ addps m5, m4 ; r1
+ mova Z(6), m6
+ mova Z(2), m5
+ mova m2, Z(3)
+ addps m3, m0 ; t6
+ subps m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, Z(3) ; i1
+ mova Z(7), m2
+ mova Z(3), m1
+ mova m4, m7
+ subps m7, m3 ; i2
+ addps m3, m4 ; i0
+ mova Z(5), m7
+ mova Z(1), m3
+%endmacro
+
+; scheduled to avoid store->load aliasing
+%macro PASS_BIG 1 ; (!interleave)
+ mova m4, Z(4) ; r2
+ mova m5, Z(5) ; i2
+ mova m2, m4
+ mova m0, [wq] ; wre
+ mova m3, m5
+ mova m1, [wq+o1q] ; wim
+ mulps m2, m0 ; r2*wre
+ mova m6, Z(6) ; r3
+ mulps m3, m1 ; i2*wim
+ mova m7, Z(7) ; i3
+ mulps m4, m1 ; r2*wim
+ mulps m5, m0 ; i2*wre
+ addps m2, m3 ; r2*wre + i2*wim
+ mova m3, m1
+ mulps m1, m6 ; r3*wim
+ subps m5, m4 ; i2*wre - r2*wim
+ mova m4, m0
+ mulps m3, m7 ; i3*wim
+ mulps m4, m6 ; r3*wre
+ mulps m0, m7 ; i3*wre
+ subps m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m1 ; i3*wre + r3*wim
+ mova m1, m4
+ addps m4, m2 ; t5
+ subps m1, m2 ; t3
+ subps m3, m4 ; r2
+ addps m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ mova m3, m5
+ subps m5, m0 ; t4
+ mova m4, m6
+ subps m6, m5 ; r3
+ addps m5, m4 ; r1
+IF%1 mova Z(6), m6
+IF%1 mova Z(2), m5
+ mova m2, Z(3)
+ addps m3, m0 ; t6
+ subps m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, Z(3) ; i1
+IF%1 mova Z(7), m2
+IF%1 mova Z(3), m1
+ mova m4, m7
+ subps m7, m3 ; i2
+ addps m3, m4 ; i0
+IF%1 mova Z(5), m7
+IF%1 mova Z(1), m3
+%if %1==0
+ mova m4, m5 ; r1
+ mova m0, m6 ; r3
+ unpcklps m5, m1
+ unpckhps m4, m1
+ unpcklps m6, m2
+ unpckhps m0, m2
+ mova m1, Z(0)
+ mova m2, Z(4)
+ mova Z(2), m5
+ mova Z(3), m4
+ mova Z(6), m6
+ mova Z(7), m0
+ mova m5, m1 ; r0
+ mova m4, m2 ; r2
+ unpcklps m1, m3
+ unpckhps m5, m3
+ unpcklps m2, m7
+ unpckhps m4, m7
+ mova Z(0), m1
+ mova Z(1), m5
+ mova Z(4), m2
+ mova Z(5), m4
+%endif
+%endmacro
+
+%macro PUNPCK 3
+ mova %3, %1
+ punpckldq %1, %2
+ punpckhdq %3, %2
+%endmacro
+
+INIT_XMM
+
+%define Z(x) [r0+mmsize*x]
+
+align 16
+fft4_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova Z(0), m0
+ mova Z(1), m1
+ ret
+
+align 16
+fft8_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ ret
+
+align 16
+fft16_sse:
+ mova m0, Z(0)
+ mova m1, Z(1)
+ T4_SSE m0, m1, m2
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T8_SSE m0, m1, m2, m3, m4, m5
+ mova m4, Z(4)
+ mova m5, Z(5)
+ mova Z(0), m0
+ mova Z(1), m1
+ mova Z(2), m2
+ mova Z(3), m3
+ T4_SSE m4, m5, m6
+ mova m6, Z(6)
+ mova m7, Z(7)
+ T4_SSE m6, m7, m0
+ PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
+ ret
+
+
+INIT_MMX
+
+%macro FFT48_3DN 1
+align 16
+fft4%1:
+ T2_3DN m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DN m0, m1, m2, m3, m4, m5
+ PUNPCK m0, m1, m4
+ PUNPCK m2, m3, m5
+ mova Z(0), m0
+ mova Z(1), m4
+ mova Z(2), m2
+ mova Z(3), m5
+ ret
+
+align 16
+fft8%1:
+ T2_3DN m0, m1, Z(0), Z(1)
+ mova m2, Z(2)
+ mova m3, Z(3)
+ T4_3DN m0, m1, m2, m3, m4, m5
+ mova Z(0), m0
+ mova Z(2), m2
+ T2_3DN m4, m5, Z(4), Z(5)
+ T2_3DN m6, m7, Z(6), Z(7)
+ pswapd m0, m5
+ pswapd m2, m7
+ pxor m0, [ps_m1p1 GLOBAL]
+ pxor m2, [ps_m1p1 GLOBAL]
+ pfsub m5, m0
+ pfadd m7, m2
+ pfmul m5, [ps_root2 GLOBAL]
+ pfmul m7, [ps_root2 GLOBAL]
+ T4_3DN m1, m3, m5, m7, m0, m2
+ mova Z(5), m5
+ mova Z(7), m7
+ mova m0, Z(0)
+ mova m2, Z(2)
+ T4_3DN m0, m2, m4, m6, m5, m7
+ PUNPCK m0, m1, m5
+ PUNPCK m2, m3, m7
+ mova Z(0), m0
+ mova Z(1), m5
+ mova Z(2), m2
+ mova Z(3), m7
+ PUNPCK m4, Z(5), m5
+ PUNPCK m6, Z(7), m7
+ mova Z(4), m4
+ mova Z(5), m5
+ mova Z(6), m6
+ mova Z(7), m7
+ ret
+%endmacro
+
+FFT48_3DN _3dn2
+
+%macro pswapd 2
+%ifidn %1, %2
+ movd [r0+12], %1
+ punpckhdq %1, [r0+8]
+%else
+ movq %1, %2
+ psrlq %1, 32
+ punpckldq %1, %2
+%endif
+%endmacro
+
+FFT48_3DN _3dn
+
+
+%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
+
+%macro DECL_PASS 2+ ; name, payload
+align 16
+%1:
+DEFINE_ARGS z, w, n, o1, o3
+ lea o3q, [nq*3]
+ lea o1q, [nq*8]
+ shl o3q, 4
+.loop:
+ %2
+ add zq, mmsize*2
+ add wq, mmsize
+ sub nd, mmsize/8
+ jg .loop
+ rep ret
+%endmacro
+
+INIT_XMM
+DECL_PASS pass_sse, PASS_BIG 1
+DECL_PASS pass_interleave_sse, PASS_BIG 0
+
+INIT_MMX
+%define mulps pfmul
+%define addps pfadd
+%define subps pfsub
+%define unpcklps punpckldq
+%define unpckhps punpckhdq
+DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
+DECL_PASS pass_interleave_3dn, PASS_BIG 0
+%define pass_3dn2 pass_3dn
+%define pass_interleave_3dn2 pass_interleave_3dn
+
+
+%macro DECL_FFT 2-3 ; nbits, cpu, suffix
+%xdefine list_of_fft fft4%2, fft8%2
+%if %1==5
+%xdefine list_of_fft list_of_fft, fft16%2
+%endif
+
+%assign n 1<<%1
+%rep 17-%1
+%assign n2 n/2
+%assign n4 n/4
+%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
+
+align 16
+fft %+ n %+ %3%2:
+ call fft %+ n2 %+ %2
+ add r0, n*4 - (n&(-2<<%1))
+ call fft %+ n4 %+ %2
+ add r0, n*2 - (n2&(-2<<%1))
+ call fft %+ n4 %+ %2
+ sub r0, n*6 + (n2&(-2<<%1))
+ lea r1, [ff_cos_ %+ n GLOBAL]
+ mov r2d, n4/2
+ jmp pass%3%2
+
+%assign n n*2
+%endrep
+%undef n
+
+align 8
+dispatch_tab%3%2: pointer list_of_fft
+
+; On x86_32, this function does the register saving and restoring for all of fft.
+; The others pass args in registers and don't spill anything.
+cglobal fft_dispatch%3%2, 2,5,0, z, nbits
+ lea r2, [dispatch_tab%3%2 GLOBAL]
+ mov r2, [r2 + (nbitsq-2)*gprsize]
+ call r2
+ RET
+%endmacro ; DECL_FFT
+
+DECL_FFT 5, _sse
+DECL_FFT 5, _sse, _interleave
+DECL_FFT 4, _3dn
+DECL_FFT 4, _3dn, _interleave
+DECL_FFT 4, _3dn2
+DECL_FFT 4, _3dn2, _interleave
+
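
Two idioms in this file are worth spelling out. DECL_FFT expands each power-of-two size into one half-size call, two quarter-size calls and a combining pass, i.e. the split-radix recursion; and fft_dispatch reaches the right size through dispatch_tab, a table of code pointers indexed by nbits-2. A runnable C sketch of the dispatch (placeholder transform bodies, hypothetical names):

    /* Mirrors "mov r2, [r2 + (nbitsq-2)*gprsize]; call r2" above. */
    typedef void (*fft_fn)(float *z);

    static void fft4_c(float *z) { /* 4-point transform body */ }
    static void fft8_c(float *z) { /* 8-point transform body */ }

    static const fft_fn dispatch_tab_c[] = { fft4_c, fft8_c /*, fft16_c, ... */ };

    static void fft_dispatch_c(float *z, int nbits)
    {
        dispatch_tab_c[nbits - 2](z);  /* nbits==2 -> fft4, 3 -> fft8, ... */
    }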
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
new file mode 100644
index 0000000000..deced3b929
--- /dev/null
+++ b/libavcodec/x86/fft_sse.c
@@ -0,0 +1,202 @@
+/*
+ * FFT/MDCT transform with SSE optimizations
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+static const int m1m1m1m1[4] __attribute__((aligned(16))) =
+ { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
+
+void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
+
+void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+
+ ff_fft_dispatch_interleave_sse(z, s->nbits);
+
+ if(n <= 16) {
+ x86_reg i = -8*n;
+ __asm__ volatile(
+ "1: \n"
+ "movaps (%0,%1), %%xmm0 \n"
+ "movaps %%xmm0, %%xmm1 \n"
+ "unpcklps 16(%0,%1), %%xmm0 \n"
+ "unpckhps 16(%0,%1), %%xmm1 \n"
+ "movaps %%xmm0, (%0,%1) \n"
+ "movaps %%xmm1, 16(%0,%1) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(z+n)
+ :"memory"
+ );
+ }
+}
+
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+ int i;
+ for(i=0; i<n; i+=2) {
+ __asm__ volatile(
+ "movaps %2, %%xmm0 \n"
+ "movlps %%xmm0, %0 \n"
+ "movhps %%xmm0, %1 \n"
+ :"=m"(s->tmp_buf[s->revtab[i]]),
+ "=m"(s->tmp_buf[s->revtab[i+1]])
+ :"m"(z[i])
+ );
+ }
+ memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
+}
+
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
+{
+ av_unused x86_reg i, j, k, l;
+ long n = 1 << s->nbits;
+ long n2 = n >> 1;
+ long n4 = n >> 2;
+ long n8 = n >> 3;
+ const uint16_t *revtab = s->fft.revtab + n8;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ FFTComplex *z = (FFTComplex *)output;
+
+ /* pre rotation */
+ for(k=n8-2; k>=0; k-=2) {
+ __asm__ volatile(
+ "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im }
+ "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im }
+ "movaps %%xmm0, %%xmm2 \n"
+ "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re }
+ "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im }
+ "movlps (%3,%1), %%xmm4 \n"
+ "movlps (%4,%1), %%xmm5 \n"
+ "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-k-2], cos[-k-1] }
+ "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-k-2], sin[-k-1] }
+ "movaps %%xmm0, %%xmm2 \n"
+ "movaps %%xmm1, %%xmm3 \n"
+ "mulps %%xmm5, %%xmm0 \n" // re*sin
+ "mulps %%xmm4, %%xmm1 \n" // im*cos
+ "mulps %%xmm4, %%xmm2 \n" // re*cos
+ "mulps %%xmm5, %%xmm3 \n" // im*sin
+ "subps %%xmm0, %%xmm1 \n" // -> re
+ "addps %%xmm3, %%xmm2 \n" // -> im
+ "movaps %%xmm1, %%xmm0 \n"
+ "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] }
+ "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] }
+ ::"r"(-4*k), "r"(4*k),
+ "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
+ );
+#ifdef ARCH_X86_64
+ // if we have enough regs, don't let gcc make the luts latency-bound
+ // but if not, latency is faster than spilling
+ __asm__("movlps %%xmm0, %0 \n"
+ "movhps %%xmm0, %1 \n"
+ "movlps %%xmm1, %2 \n"
+ "movhps %%xmm1, %3 \n"
+ :"=m"(z[revtab[-k-2]]),
+ "=m"(z[revtab[-k-1]]),
+ "=m"(z[revtab[ k ]]),
+ "=m"(z[revtab[ k+1]])
+ );
+#else
+ __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]]));
+ __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]]));
+ __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]]));
+ __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]]));
+#endif
+ }
+
+ ff_fft_dispatch_sse(z, s->fft.nbits);
+
+ /* post rotation + reinterleave + reorder */
+
+#define CMUL(j,xmm0,xmm1)\
+ "movaps (%2,"#j",2), %%xmm6 \n"\
+ "movaps 16(%2,"#j",2), "#xmm0"\n"\
+ "movaps %%xmm6, "#xmm1"\n"\
+ "movaps "#xmm0",%%xmm7 \n"\
+ "mulps (%3,"#j"), %%xmm6 \n"\
+ "mulps (%4,"#j"), "#xmm0"\n"\
+ "mulps (%4,"#j"), "#xmm1"\n"\
+ "mulps (%3,"#j"), %%xmm7 \n"\
+ "subps %%xmm6, "#xmm0"\n"\
+ "addps %%xmm7, "#xmm1"\n"
+
+ j = -n2;
+ k = n2-16;
+ __asm__ volatile(
+ "1: \n"
+ CMUL(%0, %%xmm0, %%xmm1)
+ CMUL(%1, %%xmm4, %%xmm5)
+ "shufps $0x1b, %%xmm1, %%xmm1 \n"
+ "shufps $0x1b, %%xmm5, %%xmm5 \n"
+ "movaps %%xmm4, %%xmm6 \n"
+ "unpckhps %%xmm1, %%xmm4 \n"
+ "unpcklps %%xmm1, %%xmm6 \n"
+ "movaps %%xmm0, %%xmm2 \n"
+ "unpcklps %%xmm5, %%xmm0 \n"
+ "unpckhps %%xmm5, %%xmm2 \n"
+ "movaps %%xmm6, (%2,%1,2) \n"
+ "movaps %%xmm4, 16(%2,%1,2) \n"
+ "movaps %%xmm0, (%2,%0,2) \n"
+ "movaps %%xmm2, 16(%2,%0,2) \n"
+ "sub $16, %1 \n"
+ "add $16, %0 \n"
+ "jl 1b \n"
+ :"+&r"(j), "+&r"(k)
+ :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
+ :"memory"
+ );
+}
+
+void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
+{
+ x86_reg j, k;
+ long n = 1 << s->nbits;
+ long n4 = n >> 2;
+
+ ff_imdct_half_sse(s, output+n4, input);
+
+ j = -n;
+ k = n-16;
+ __asm__ volatile(
+ "movaps %4, %%xmm7 \n"
+ "1: \n"
+ "movaps (%2,%1), %%xmm0 \n"
+ "movaps (%3,%0), %%xmm1 \n"
+ "shufps $0x1b, %%xmm0, %%xmm0 \n"
+ "shufps $0x1b, %%xmm1, %%xmm1 \n"
+ "xorps %%xmm7, %%xmm0 \n"
+ "movaps %%xmm1, (%3,%1) \n"
+ "movaps %%xmm0, (%2,%0) \n"
+ "sub $16, %1 \n"
+ "add $16, %0 \n"
+ "jl 1b \n"
+ :"+r"(j), "+r"(k)
+ :"r"(output+n4), "r"(output+n4*3),
+ "m"(*m1m1m1m1)
+ );
+}
+
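
ff_fft_permute_sse above is a bit-reversal scatter through the precomputed revtab, staged in tmp_buf so it need not run in place; the movlps/movhps pair simply moves two complex values per iteration. The scalar equivalent is the sketch below (hypothetical names):

    #include <stdint.h>
    #include <string.h>

    typedef struct { float re, im; } cplx;

    /* Scatter each element to its bit-reversed slot, then copy back. */
    static void fft_permute_c(cplx *z, cplx *tmp_buf,
                              const uint16_t *revtab, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            tmp_buf[revtab[i]] = z[i];
        memcpy(z, tmp_buf, n * sizeof(*z));
    }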
diff --git a/libavcodec/x86/flacdsp_mmx.c b/libavcodec/x86/flacdsp_mmx.c
new file mode 100644
index 0000000000..01c0d7ae8a
--- /dev/null
+++ b/libavcodec/x86/flacdsp_mmx.c
@@ -0,0 +1,139 @@
+/*
+ * MMX optimized FLAC DSP utils
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "dsputil_mmx.h"
+
+static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
+{
+ double c = 2.0 / (len-1.0);
+ int n2 = len>>1;
+ x86_reg i = -n2*sizeof(int32_t);
+ x86_reg j = n2*sizeof(int32_t);
+ __asm__ volatile(
+ "movsd %0, %%xmm7 \n\t"
+ "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t"
+ "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t"
+ "movlhps %%xmm7, %%xmm7 \n\t"
+ "subpd %%xmm5, %%xmm7 \n\t"
+ "addsd %%xmm6, %%xmm7 \n\t"
+ ::"m"(c)
+ );
+#define WELCH(MOVPD, offset)\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movapd %%xmm7, %%xmm1 \n\t"\
+ "mulpd %%xmm1, %%xmm1 \n\t"\
+ "movapd %%xmm6, %%xmm0 \n\t"\
+ "subpd %%xmm1, %%xmm0 \n\t"\
+ "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
+ "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
+ "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
+ "mulpd %%xmm0, %%xmm2 \n\t"\
+ "mulpd %%xmm1, %%xmm3 \n\t"\
+ "movapd %%xmm2, (%2,%0,2) \n\t"\
+ MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
+ "subpd %%xmm5, %%xmm7 \n\t"\
+ "sub $8, %1 \n\t"\
+ "add $8, %0 \n\t"\
+ "jl 1b \n\t"\
+ :"+&r"(i), "+&r"(j)\
+ :"r"(w_data+n2), "r"(data+n2)\
+ );
+ if(len&1)
+ WELCH("movupd", -1)
+ else
+ WELCH("movapd", -2)
+#undef WELCH
+}
+
+void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+ double *autoc)
+{
+ double tmp[len + lag + 2];
+ double *data1 = tmp + lag;
+ int j;
+
+ if((x86_reg)data1 & 15)
+ data1++;
+
+ apply_welch_window_sse2(data, len, data1);
+
+ for(j=0; j<lag; j++)
+ data1[j-lag]= 0.0;
+ data1[len] = 0.0;
+
+ for(j=0; j<lag; j+=2){
+ x86_reg i = -len*sizeof(double);
+ if(j == lag-2) {
+ __asm__ volatile(
+ "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t"
+ "1: \n\t"
+ "movapd (%4,%0), %%xmm3 \n\t"
+ "movupd -8(%5,%0), %%xmm4 \n\t"
+ "movapd (%5,%0), %%xmm5 \n\t"
+ "mulpd %%xmm3, %%xmm4 \n\t"
+ "mulpd %%xmm3, %%xmm5 \n\t"
+ "mulpd -16(%5,%0), %%xmm3 \n\t"
+ "addpd %%xmm4, %%xmm1 \n\t"
+ "addpd %%xmm5, %%xmm0 \n\t"
+ "addpd %%xmm3, %%xmm2 \n\t"
+ "add $16, %0 \n\t"
+ "jl 1b \n\t"
+ "movhlps %%xmm0, %%xmm3 \n\t"
+ "movhlps %%xmm1, %%xmm4 \n\t"
+ "movhlps %%xmm2, %%xmm5 \n\t"
+ "addsd %%xmm3, %%xmm0 \n\t"
+ "addsd %%xmm4, %%xmm1 \n\t"
+ "addsd %%xmm5, %%xmm2 \n\t"
+ "movsd %%xmm0, %1 \n\t"
+ "movsd %%xmm1, %2 \n\t"
+ "movsd %%xmm2, %3 \n\t"
+ :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
+ :"r"(data1+len), "r"(data1+len-j)
+ );
+ } else {
+ __asm__ volatile(
+ "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+ "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+ "1: \n\t"
+ "movapd (%3,%0), %%xmm3 \n\t"
+ "movupd -8(%4,%0), %%xmm4 \n\t"
+ "mulpd %%xmm3, %%xmm4 \n\t"
+ "mulpd (%4,%0), %%xmm3 \n\t"
+ "addpd %%xmm4, %%xmm1 \n\t"
+ "addpd %%xmm3, %%xmm0 \n\t"
+ "add $16, %0 \n\t"
+ "jl 1b \n\t"
+ "movhlps %%xmm0, %%xmm3 \n\t"
+ "movhlps %%xmm1, %%xmm4 \n\t"
+ "addsd %%xmm3, %%xmm0 \n\t"
+ "addsd %%xmm4, %%xmm1 \n\t"
+ "movsd %%xmm0, %1 \n\t"
+ "movsd %%xmm1, %2 \n\t"
+ :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
+ :"r"(data1+len), "r"(data1+len-j)
+ );
+ }
+ }
+}
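
Stripped of the SIMD scheduling, the two asm branches above compute the plain windowed autocorrelation sketched below (hypothetical name). Two details are carried over from the asm: each accumulator is seeded with 1.0 (the movsd ff_pd_1 loads), and data1[] is zero-padded below index 0 so the lagged access never underruns, which the i >= j loop bound makes explicit here.

    /* autoc[j] = 1.0 + sum over i of w[i]*w[i-j], for j = 0..lag,
     * on the Welch-windowed samples w[] (data1 above). */
    static void flac_autocorr_c(const double *w, int len, int lag,
                                double *autoc)
    {
        int i, j;
        for (j = 0; j <= lag; j++) {
            double sum = 1.0;
            for (i = j; i < len; i++)
                sum += w[i] * w[i - j];
            autoc[j] = sum;
        }
    }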
diff --git a/libavcodec/x86/h264_deblock_sse2.asm b/libavcodec/x86/h264_deblock_sse2.asm
new file mode 100644
index 0000000000..d59de919be
--- /dev/null
+++ b/libavcodec/x86/h264_deblock_sse2.asm
@@ -0,0 +1,747 @@
+;*****************************************************************************
+;* deblock-a.asm: h264 encoder library
+;*****************************************************************************
+;* Copyright (C) 2005-2008 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+;*****************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+pb_00: times 16 db 0x00
+pb_01: times 16 db 0x01
+pb_03: times 16 db 0x03
+pb_a1: times 16 db 0xa1
+
+SECTION .text
+
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base+stride], [base+stride*2], [base3], \
+ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+
+; in: 8 rows of 4 bytes in %1..%8
+; out: 4 rows of 8 bytes in m0..m3
+%macro TRANSPOSE4x8_LOAD 8
+ movd m0, %1
+ movd m2, %2
+ movd m1, %3
+ movd m3, %4
+ punpcklbw m0, m2
+ punpcklbw m1, m3
+ movq m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+
+ movd m4, %5
+ movd m6, %6
+ movd m5, %7
+ movd m7, %8
+ punpcklbw m4, m6
+ punpcklbw m5, m7
+ movq m6, m4
+ punpcklwd m4, m5
+ punpckhwd m6, m5
+
+ movq m1, m0
+ movq m3, m2
+ punpckldq m0, m4
+ punpckhdq m1, m4
+ punpckldq m2, m6
+ punpckhdq m3, m6
+%endmacro
+
+; in: 4 rows of 8 bytes in m0..m3
+; out: 8 rows of 4 bytes in %1..%8
+%macro TRANSPOSE8x4_STORE 8
+ movq m4, m0
+ movq m5, m1
+ movq m6, m2
+ punpckhdq m4, m4
+ punpckhdq m5, m5
+ punpckhdq m6, m6
+
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ movq m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ movd %1, m0
+ punpckhdq m0, m0
+ movd %2, m0
+ movd %3, m1
+ punpckhdq m1, m1
+ movd %4, m1
+
+ punpckhdq m3, m3
+ punpcklbw m4, m5
+ punpcklbw m6, m3
+ movq m5, m4
+ punpcklwd m4, m6
+ punpckhwd m5, m6
+ movd %5, m4
+ punpckhdq m4, m4
+ movd %6, m4
+ movd %7, m5
+ punpckhdq m5, m5
+ movd %8, m5
+%endmacro
+
+%macro SBUTTERFLY 4
+ movq %4, %2
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
+; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
+%macro TRANSPOSE6x8_MEM 9
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, m0, m1, m7
+ SBUTTERFLY bw, m2, m3, m1
+ SBUTTERFLY bw, m4, m5, m3
+ movq [%9+0x10], m1
+ SBUTTERFLY bw, m6, %8, m5
+ SBUTTERFLY wd, m0, m2, m1
+ SBUTTERFLY wd, m4, m6, m2
+ punpckhdq m0, m4
+ movq [%9+0x00], m0
+ SBUTTERFLY wd, m7, [%9+0x10], m6
+ SBUTTERFLY wd, m3, m5, m4
+ SBUTTERFLY dq, m7, m3, m0
+ SBUTTERFLY dq, m1, m2, m5
+ punpckldq m6, m4
+ movq [%9+0x10], m1
+ movq [%9+0x20], m5
+ movq [%9+0x30], m7
+ movq [%9+0x40], m0
+ movq [%9+0x50], m6
+%endmacro
+
+; in: 8 rows of 8 in %1..%8
+; out: 8 rows of 8 in %9..%16
+%macro TRANSPOSE8x8_MEM 16
+ movq m0, %1
+ movq m1, %2
+ movq m2, %3
+ movq m3, %4
+ movq m4, %5
+ movq m5, %6
+ movq m6, %7
+ SBUTTERFLY bw, m0, m1, m7
+ SBUTTERFLY bw, m2, m3, m1
+ SBUTTERFLY bw, m4, m5, m3
+ SBUTTERFLY bw, m6, %8, m5
+ movq %9, m3
+ SBUTTERFLY wd, m0, m2, m3
+ SBUTTERFLY wd, m4, m6, m2
+ SBUTTERFLY wd, m7, m1, m6
+ movq %11, m2
+ movq m2, %9
+ SBUTTERFLY wd, m2, m5, m1
+ SBUTTERFLY dq, m0, m4, m5
+ SBUTTERFLY dq, m7, m2, m4
+ movq %9, m0
+ movq %10, m5
+ movq %13, m7
+ movq %14, m4
+ SBUTTERFLY dq, m3, %11, m0
+ SBUTTERFLY dq, m6, m1, m5
+ movq %11, m3
+ movq %12, m0
+ movq %15, m6
+ movq %16, m5
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT 5
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ por %4, %5
+ psubusb %4, %3
+%endmacro
+
+; out: %4 = |%1-%2|>%3
+; clobbers: %5
+%macro DIFF_GT2 5
+ mova %5, %2
+ mova %4, %1
+ psubusb %5, %1
+ psubusb %4, %2
+ psubusb %5, %3
+ psubusb %4, %3
+ pcmpeqb %4, %5
+%endmacro
+
+%macro SPLATW 1
+%ifidn m0, xmm0
+ pshuflw %1, %1, 0
+ punpcklqdq %1, %1
+%else
+ pshufw %1, %1, 0
+%endif
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
+; out: m5=beta-1, m7=mask, %3=alpha-1
+; clobbers: m4,m6
+%macro LOAD_MASK 2-3
+ movd m4, %1
+ movd m5, %2
+ SPLATW m4
+ SPLATW m5
+ packuswb m4, m4 ; 16x alpha-1
+ packuswb m5, m5 ; 16x beta-1
+%if %0>2
+ mova %3, m4
+%endif
+ DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
+ DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
+ por m7, m4
+ DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
+ por m7, m4
+ pxor m6, m6
+ pcmpeqb m7, m6
+%endmacro
+
+; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
+; out: m1=p0' m2=q0'
+; clobbers: m0,3-6
+%macro DEBLOCK_P0_Q0 0
+ mova m5, m1
+ pxor m5, m2 ; p0^q0
+ pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
+ pcmpeqb m4, m4
+ pxor m3, m4
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pxor m4, m1
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m3, m5
+ paddusb m3, m4 ; d+128+33
+ mova m6, [pb_a1 GLOBAL]
+ psubusb m6, m3
+ psubusb m3, [pb_a1 GLOBAL]
+ pminub m6, m7
+ pminub m3, m7
+ psubusb m1, m6
+ psubusb m2, m3
+ paddusb m1, m3
+ paddusb m2, m6
+%endmacro
+
+; in: m1=p0 m2=q0
+; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
+; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+; clobbers: q2, tmp, tc0
+%macro LUMA_Q1 6
+ mova %6, m1
+ pavgb %6, m2
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pxor %6, %3
+ pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ mova %6, %1
+ psubusb %6, %5
+ paddusb %5, %1
+ pmaxub %2, %6
+ pminub %2, %5
+ mova %4, %2
+%endmacro
+
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal x264_deblock_v_luma_sse2
+ movd m8, [r4] ; tc0
+ lea r4, [r1*3]
+ dec r2d ; alpha-1
+ neg r4
+ dec r3d ; beta-1
+ add r4, r0 ; pix-3*stride
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2d, r3d
+
+ punpcklbw m8, m8
+ punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ pcmpeqb m9, m9
+ pcmpeqb m9, m8
+ pandn m9, m7
+ pand m8, m9
+
+ movdqa m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m9
+ mova m7, m8
+ psubb m7, m6
+ pand m6, m8
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ movdqa m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ pand m6, m9
+ pand m8, m6
+ psubb m7, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ ret
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_sse2
+ movsxd r10, esi
+ lea r11, [r10+r10*2]
+ lea rax, [r0-4]
+ lea r9, [r0-4+r11]
+ sub rsp, 0x68
+ %define pix_tmp rsp
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
+ lea rax, [rax+r10*8]
+ lea r9, [r9 +r10*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+
+ ; vertical filter
+ ; alpha, beta, tc0 are still in r2d, r3d, r4
+ ; don't back up rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ lea r0, [pix_tmp+0x30]
+ mov esi, 0x10
+ call x264_deblock_v_luma_sse2
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ add rax, 2
+ add r9, 2
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+
+ add rsp, 0x68
+ ret
+
+%else
+
+%macro DEBLOCK_LUMA 3
+;-----------------------------------------------------------------------------
+; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_%2_luma_%1, 5,5
+ lea r4, [r1*3]
+ dec r2 ; alpha-1
+ neg r4
+ dec r3 ; beta-1
+ add r4, r0 ; pix-3*stride
+ %assign pad 2*%3+12-(stack_offset&15)
+ SUB esp, pad
+
+ mova m0, [r4+r1] ; p1
+ mova m1, [r4+2*r1] ; p0
+ mova m2, [r0] ; q0
+ mova m3, [r0+r1] ; q1
+ LOAD_MASK r2, r3
+
+ mov r3, r4m
+ movd m4, [r3] ; tc0
+ punpcklbw m4, m4
+ punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
+ mova [esp+%3], m4 ; tc
+ pcmpeqb m3, m3
+ pcmpgtb m4, m3
+ pand m4, m7
+ mova [esp], m4 ; mask
+
+ mova m3, [r4] ; p2
+ DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
+ pand m6, m4
+ pand m4, [esp+%3] ; tc
+ mova m7, m4
+ psubb m7, m6
+ pand m6, m4
+ LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
+
+ mova m4, [r0+2*r1] ; q2
+ DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
+ mova m5, [esp] ; mask
+ pand m6, m5
+ mova m5, [esp+%3] ; tc
+ pand m5, m6
+ psubb m7, m6
+ mova m3, [r0+r1]
+ LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
+
+ DEBLOCK_P0_Q0
+ mova [r4+2*r1], m1
+ mova [r0], m2
+ ADD esp, pad
+ RET
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_deblock_h_luma_%1, 0,5
+ mov r0, r0m
+ mov r3, r1m
+ lea r4, [r3*3]
+ sub r0, 4
+ lea r1, [r0+r4]
+ %assign pad 0x78-(stack_offset&15)
+ SUB esp, pad
+%define pix_tmp esp+12
+
+ ; transpose 6x16 -> tmp space
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
+
+ ; vertical filter
+ lea r0, [pix_tmp+0x30]
+ PUSH dword r4m
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH dword r0
+ call x264_deblock_%2_luma_%1
+%ifidn %2, v8
+ add dword [esp ], 8 ; pix_tmp+0x38
+ add dword [esp+16], 2 ; tc0+2
+ call x264_deblock_%2_luma_%1
+%endif
+ ADD esp, 20
+
+ ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
+ mov r0, r0m
+ sub r0, 2
+ lea r1, [r0+r4]
+
+ movq m0, [pix_tmp+0x10]
+ movq m1, [pix_tmp+0x20]
+ movq m2, [pix_tmp+0x30]
+ movq m3, [pix_tmp+0x40]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ lea r0, [r0+r3*8]
+ lea r1, [r1+r3*8]
+ movq m0, [pix_tmp+0x18]
+ movq m1, [pix_tmp+0x28]
+ movq m2, [pix_tmp+0x38]
+ movq m3, [pix_tmp+0x48]
+ TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+
+ ADD esp, pad
+ RET
+%endmacro ; DEBLOCK_LUMA
+
+INIT_XMM
+DEBLOCK_LUMA sse2, v, 16
+
+%endif ; ARCH
+
+
+
+%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+ mova t0, p2
+ mova t1, p0
+ pavgb t0, p1
+ pavgb t1, q0
+ pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ mova t5, t1
+ mova t2, p2
+ mova t3, p0
+ paddb t2, p1
+ paddb t3, q0
+ paddb t2, t3
+ mova t3, t2
+ mova t4, t2
+ psrlw t2, 1
+ pavgb t2, mpb_00
+ pxor t2, t0
+ pand t2, mpb_01
+ psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
+
+ mova t1, p2
+ mova t2, p2
+ pavgb t1, q1
+ psubb t2, q1
+ paddb t3, t3
+ psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
+ pand t2, mpb_01
+ psubb t1, t2
+ pavgb t1, p1
+ pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
+ psrlw t3, 2
+ pavgb t3, mpb_00
+ pxor t3, t1
+ pand t3, mpb_01
+ psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
+
+ mova t3, p0
+ mova t2, p0
+ pxor t3, q1
+ pavgb t2, q1
+ pand t3, mpb_01
+ psubb t2, t3
+ pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
+
+ pxor t1, t2
+ pxor t2, p0
+ pand t1, mask1p
+ pand t2, mask0
+ pxor t1, t2
+ pxor t1, p0
+ mova %1, t1 ; store p0
+
+ mova t1, %4 ; p3
+ mova t2, t1
+ pavgb t1, p2
+ paddb t2, p2
+ pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
+ paddb t2, t2
+ paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
+ psrlw t2, 2
+ pavgb t2, mpb_00
+ pxor t2, t1
+ pand t2, mpb_01
+ psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
+
+ pxor t0, p1
+ pxor t1, p2
+ pand t0, mask1p
+ pand t1, mask1p
+ pxor t0, p1
+ pxor t1, p2
+ mova %2, t0 ; store p1
+ mova %3, t1 ; store p2
+%endmacro
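+
+; A per-byte sketch of the rounding trick used throughout this macro
+; (illustrative only): pavgb computes (a+b+1)>>1, always rounding up on
+; odd sums.  Since (a^b)&1 == (a+b)&1, subtracting that parity bit after
+; a pavgb gives the exact floor average; the psrlw/pavgb mpb_00 blocks
+; recompute the same sums at word precision so the pxor/pand mpb_01/psubb
+; sequences can correct the last bit of the pavgb chains:
+;     avg  = (a + b + 1) >> 1    ; pavgb
+;     avg -= (a ^ b) & 1         ; now exactly (a + b) >> 1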
+
+%macro LUMA_INTRA_SWAP_PQ 0
+ %define q1 m0
+ %define q0 m1
+ %define p0 m2
+ %define p1 m3
+ %define p2 q2
+ %define mask1p mask1q
+%endmacro
+
+%macro DEBLOCK_LUMA_INTRA 2
+ %define p1 m0
+ %define p0 m1
+ %define q0 m2
+ %define q1 m3
+ %define t0 m4
+ %define t1 m5
+ %define t2 m6
+ %define t3 m7
+%ifdef ARCH_X86_64
+ %define p2 m8
+ %define q2 m9
+ %define t4 m10
+ %define t5 m11
+ %define mask0 m12
+ %define mask1p m13
+ %define mask1q [rsp-24]
+ %define mpb_00 m14
+ %define mpb_01 m15
+%else
+ %define spill(x) [esp+16*x+((stack_offset+4)&15)]
+ %define p2 [r4+r1]
+ %define q2 [r0+2*r1]
+ %define t4 spill(0)
+ %define t5 spill(1)
+ %define mask0 spill(2)
+ %define mask1p spill(3)
+ %define mask1q spill(4)
+ %define mpb_00 [pb_00 GLOBAL]
+ %define mpb_01 [pb_01 GLOBAL]
+%endif
+
+;-----------------------------------------------------------------------------
+; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_%2_luma_intra_%1, 4,6
+%ifndef ARCH_X86_64
+ sub esp, 0x60
+%endif
+ lea r4, [r1*4]
+ lea r5, [r1*3] ; 3*stride
+ dec r2d ; alpha-1
+ jl .end
+ neg r4
+ dec r3d ; beta-1
+ jl .end
+ add r4, r0 ; pix-4*stride
+ mova p1, [r4+2*r1]
+ mova p0, [r4+r5]
+ mova q0, [r0]
+ mova q1, [r0+r1]
+%ifdef ARCH_X86_64
+ pxor mpb_00, mpb_00
+ mova mpb_01, [pb_01 GLOBAL]
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ SWAP 7, 12 ; m12=mask0
+ pavgb t5, mpb_00
+ pavgb t5, mpb_01 ; alpha/4+1
+ movdqa p2, [r4+r1]
+ movdqa q2, [r0+2*r1]
+ DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
+ DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
+ DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
+ pand t0, mask0
+ pand t4, t0
+ pand t2, t0
+ mova mask1q, t4
+ mova mask1p, t2
+%else
+ LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
+ mova m4, t5
+ mova mask0, m7
+ pavgb m4, [pb_00 GLOBAL]
+ pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
+ DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
+ pand m6, mask0
+ DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
+ pand m4, m6
+ mova mask1p, m4
+ DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
+ pand m4, m6
+ mova mask1q, m4
+%endif
+ LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
+ LUMA_INTRA_SWAP_PQ
+ LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
+.end:
+%ifndef ARCH_X86_64
+ add esp, 0x60
+%endif
+ RET
+
+INIT_MMX
+%ifdef ARCH_X86_64
+;-----------------------------------------------------------------------------
+; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+;-----------------------------------------------------------------------------
+cglobal x264_deblock_h_luma_intra_%1
+ movsxd r10, r1d
+ lea r11, [r10*3]
+ lea rax, [r0-4]
+ lea r9, [r0-4+r11]
+ sub rsp, 0x88
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea rax, [rax+r10*8]
+ lea r9, [r9+r10*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ mov r1, 0x10
+ call x264_deblock_v_luma_intra_%1
+
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ lea r9, [rax+r11]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ shl r10, 3
+ sub rax, r10
+ sub r9, r10
+ shr r10, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ add rsp, 0x88
+ ret
+%else
+cglobal x264_deblock_h_luma_intra_%1, 2,4
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+%assign pad 0x8c-(stack_offset&15)
+ SUB rsp, pad
+ %define pix_tmp rsp
+
+ ; transpose 8x16 -> tmp space
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ lea r0, [pix_tmp+0x40]
+ PUSH dword r3m
+ PUSH dword r2m
+ PUSH dword 16
+ PUSH r0
+ call x264_deblock_%2_luma_intra_%1
+%ifidn %2, v8
+ add dword [rsp], 8 ; pix_tmp+8
+ call x264_deblock_%2_luma_intra_%1
+%endif
+ ADD esp, 16
+
+ mov r1, r1m
+ mov r0, r0m
+ lea r3, [r1*3]
+ sub r0, 4
+ lea r2, [r0+r3]
+ ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ lea r0, [r0+r1*8]
+ lea r2, [r2+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
+ ADD rsp, pad
+ RET
+%endif ; ARCH_X86_64
+%endmacro ; DEBLOCK_LUMA_INTRA
+
+INIT_XMM
+DEBLOCK_LUMA_INTRA sse2, v
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_LUMA_INTRA mmxext, v8
+%endif
diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
new file mode 100644
index 0000000000..909c27490d
--- /dev/null
+++ b/libavcodec/x86/h264_i386.h
@@ -0,0 +1,155 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file h264_i386.h
+ * H.264 / AVC / MPEG4 part10 codec.
+ * non-MMX i386-specific optimizations for H.264
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_X86_H264_I386_H
+#define AVCODEC_X86_H264_I386_H
+
+#include "libavcodec/cabac.h"
+
+//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet,
+//as that would make the optimization work harder)
+#if defined(ARCH_X86) && defined(HAVE_7REGS) && \
+ defined(HAVE_EBX_AVAILABLE) && \
+ !defined(BROKEN_RELOCATIONS)
+static int decode_significance_x86(CABACContext *c, int max_coeff,
+ uint8_t *significant_coeff_ctx_base,
+ int *index){
+ void *end= significant_coeff_ctx_base + max_coeff - 1;
+ int minusstart= -(int)significant_coeff_ctx_base;
+ int minusindex= 4-(int)index;
+ int coeff_count;
+ __asm__ volatile(
+ "movl "RANGE "(%3), %%esi \n\t"
+ "movl "LOW "(%3), %%ebx \n\t"
+
+ "2: \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "test $1, %%edx \n\t"
+ " jz 3f \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %2, %%"REG_a" \n\t"
+ "movl %4, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%%"REG_a") \n\t"
+
+ "test $1, %%edx \n\t"
+ " jnz 4f \n\t"
+
+ "add $4, %%"REG_a" \n\t"
+ "mov %%"REG_a", %2 \n\t"
+
+ "3: \n\t"
+ "add $1, %1 \n\t"
+ "cmp %5, %1 \n\t"
+ " jb 2b \n\t"
+ "mov %2, %%"REG_a" \n\t"
+ "movl %4, %%ecx \n\t"
+ "add %1, %%"REG_c" \n\t"
+ "movl %%ecx, (%%"REG_a") \n\t"
+ "4: \n\t"
+ "add %6, %%eax \n\t"
+ "shr $2, %%eax \n\t"
+
+ "movl %%esi, "RANGE "(%3) \n\t"
+ "movl %%ebx, "LOW "(%3) \n\t"
+ :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)
+ :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)
+ : "%"REG_c, "%ebx", "%edx", "%esi", "memory"
+ );
+ return coeff_count;
+}
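+
+/* For reference, a scalar sketch of the loop implemented above
+ * (illustrative only; the generic C version lives in the H.264 CABAC
+ * code).  The last_significant_coeff contexts sit 61 bytes past the
+ * significance contexts, hence the "61(%1)" operand. */
+#if 0
+static int decode_significance_ref(CABACContext *c, int max_coeff,
+                                   uint8_t *sig_ctx_base, int *index)
+{
+    int i, coeff_count = 0;
+    for (i = 0; i < max_coeff - 1; i++) {
+        if (get_cabac(c, sig_ctx_base + i)) {          /* significant? */
+            index[coeff_count++] = i;
+            if (get_cabac(c, sig_ctx_base + i + 61))   /* last coeff?  */
+                return coeff_count;
+        }
+    }
+    index[coeff_count++] = i; /* highest coeff is implicitly significant */
+    return coeff_count;
+}
+#endif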
+
+static int decode_significance_8x8_x86(CABACContext *c,
+ uint8_t *significant_coeff_ctx_base,
+ int *index, const uint8_t *sig_off){
+ int minusindex= 4-(int)index;
+ int coeff_count;
+ x86_reg last=0;
+ __asm__ volatile(
+ "movl "RANGE "(%3), %%esi \n\t"
+ "movl "LOW "(%3), %%ebx \n\t"
+
+ "mov %1, %%"REG_D" \n\t"
+ "2: \n\t"
+
+ "mov %6, %%"REG_a" \n\t"
+ "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t"
+ "add %5, %%"REG_D" \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %1, %%edi \n\t"
+ "test $1, %%edx \n\t"
+ " jz 3f \n\t"
+
+ "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
+ "add %5, %%"REG_D" \n\t"
+
+ BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx",
+ "%%bx", "%%esi", "%%eax", "%%al")
+
+ "mov %2, %%"REG_a" \n\t"
+ "mov %1, %%edi \n\t"
+ "movl %%edi, (%%"REG_a") \n\t"
+
+ "test $1, %%edx \n\t"
+ " jnz 4f \n\t"
+
+ "add $4, %%"REG_a" \n\t"
+ "mov %%"REG_a", %2 \n\t"
+
+ "3: \n\t"
+ "addl $1, %%edi \n\t"
+ "mov %%edi, %1 \n\t"
+ "cmpl $63, %%edi \n\t"
+ " jb 2b \n\t"
+ "mov %2, %%"REG_a" \n\t"
+ "movl %%edi, (%%"REG_a") \n\t"
+ "4: \n\t"
+ "addl %4, %%eax \n\t"
+ "shr $2, %%eax \n\t"
+
+ "movl %%esi, "RANGE "(%3) \n\t"
+ "movl %%ebx, "LOW "(%3) \n\t"
+ :"=&a"(coeff_count),"+m"(last), "+m"(index)
+ :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)
+ : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"
+ );
+ return coeff_count;
+}
+#endif /* defined(ARCH_X86) && defined(HAVE_7REGS) && */
+ /* defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */
+
+#endif /* AVCODEC_X86_H264_I386_H */
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
new file mode 100644
index 0000000000..7d19f995ec
--- /dev/null
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -0,0 +1,2208 @@
+/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
+DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
+
+/***********************************/
+/* IDCT */
+
+#define SUMSUB_BADC( a, b, c, d ) \
+ "paddw "#b", "#a" \n\t"\
+ "paddw "#d", "#c" \n\t"\
+ "paddw "#b", "#b" \n\t"\
+ "paddw "#d", "#d" \n\t"\
+ "psubw "#a", "#b" \n\t"\
+ "psubw "#c", "#d" \n\t"
+
+#define SUMSUBD2_AB( a, b, t ) \
+ "movq "#b", "#t" \n\t"\
+ "psraw $1 , "#b" \n\t"\
+ "paddw "#a", "#b" \n\t"\
+ "psraw $1 , "#a" \n\t"\
+ "psubw "#t", "#a" \n\t"
+
+#define IDCT4_1D( s02, s13, d02, d13, t ) \
+ SUMSUB_BA ( s02, d02 )\
+ SUMSUBD2_AB( s13, d13, t )\
+ SUMSUB_BADC( d13, s02, s13, d02 )
+
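+/* Scalar sketch of what IDCT4_1D computes on a 4-element column
+ * (illustrative only; this is the standard H.264 4-point transform): */
+#if 0
+static void idct4_1d_ref(int16_t d[4])
+{
+    const int z0 =  d[0] + d[2];          /* s02 sum        */
+    const int z1 =  d[0] - d[2];          /* d02 difference */
+    const int z2 = (d[1] >> 1) - d[3];    /* s13 with >>1   */
+    const int z3 =  d[1] + (d[3] >> 1);   /* d13 with >>1   */
+    d[0] = z0 + z3;
+    d[1] = z1 + z2;
+    d[2] = z1 - z2;
+    d[3] = z0 - z3;
+}
+#endif
+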
+#define STORE_DIFF_4P( p, t, z ) \
+ "psraw $6, "#p" \n\t"\
+ "movd (%0), "#t" \n\t"\
+ "punpcklbw "#z", "#t" \n\t"\
+ "paddsw "#t", "#p" \n\t"\
+ "packuswb "#z", "#p" \n\t"\
+ "movd "#p", (%0) \n\t"
+
+static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ /* Load dct coeffs */
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ :: "r"(block) );
+
+ __asm__ volatile(
+ /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
+ IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
+
+ "movq %0, %%mm6 \n\t"
+ /* in: 1,4,0,2 out: 1,2,3,0 */
+ TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
+
+ "paddw %%mm6, %%mm3 \n\t"
+
+ /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
+ IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
+
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "m"(ff_pw_32));
+
+ __asm__ volatile(
+ STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
+ : "+r"(dst)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void h264_idct8_1d(int16_t *block)
+{
+ __asm__ volatile(
+ "movq 112(%0), %%mm7 \n\t"
+ "movq 80(%0), %%mm0 \n\t"
+ "movq 48(%0), %%mm3 \n\t"
+ "movq 16(%0), %%mm5 \n\t"
+
+ "movq %%mm0, %%mm4 \n\t"
+ "movq %%mm5, %%mm1 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "psraw $1, %%mm1 \n\t"
+ "paddw %%mm0, %%mm4 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "paddw %%mm7, %%mm4 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psubw %%mm5, %%mm4 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+
+ "psubw %%mm3, %%mm5 \n\t"
+ "psubw %%mm3, %%mm0 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psraw $1, %%mm3 \n\t"
+ "psraw $1, %%mm7 \n\t"
+ "psubw %%mm3, %%mm5 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+
+ "movq %%mm4, %%mm3 \n\t"
+ "movq %%mm1, %%mm7 \n\t"
+ "psraw $2, %%mm1 \n\t"
+ "psraw $2, %%mm3 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "psraw $2, %%mm5 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psubw %%mm4, %%mm5 \n\t"
+ "psubw %%mm0, %%mm7 \n\t"
+
+ "movq 32(%0), %%mm2 \n\t"
+ "movq 96(%0), %%mm6 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "psubw %%mm0, %%mm4 \n\t"
+ "paddw %%mm2, %%mm6 \n\t"
+
+ "movq (%0), %%mm2 \n\t"
+ "movq 64(%0), %%mm0 \n\t"
+ SUMSUB_BA( %%mm0, %%mm2 )
+ SUMSUB_BA( %%mm6, %%mm0 )
+ SUMSUB_BA( %%mm4, %%mm2 )
+ SUMSUB_BA( %%mm7, %%mm6 )
+ SUMSUB_BA( %%mm5, %%mm4 )
+ SUMSUB_BA( %%mm3, %%mm2 )
+ SUMSUB_BA( %%mm1, %%mm0 )
+ :: "r"(block)
+ );
+}
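+
+/* Scalar sketch of this 8-point 1D transform (illustrative only; it
+ * mirrors the generic C version of the H.264 8x8 IDCT): */
+#if 0
+static void idct8_1d_ref(int16_t d[8])
+{
+    const int a0 =  d[0] + d[4];
+    const int a4 =  d[0] - d[4];
+    const int a2 = (d[2] >> 1) - d[6];
+    const int a6 = (d[6] >> 1) + d[2];
+    const int b0 = a0 + a6;
+    const int b2 = a4 + a2;
+    const int b4 = a4 - a2;
+    const int b6 = a0 - a6;
+    const int a1 = -d[3] + d[5] - d[7] - (d[7] >> 1);
+    const int a3 =  d[1] + d[7] - d[3] - (d[3] >> 1);
+    const int a5 = -d[1] + d[7] + d[5] + (d[5] >> 1);
+    const int a7 =  d[3] + d[5] + d[1] + (d[1] >> 1);
+    const int b1 = (a7 >> 2) + a1;
+    const int b3 =  a3 + (a5 >> 2);
+    const int b5 = (a3 >> 2) - a5;
+    const int b7 =  a7 - (a1 >> 2);
+    d[0] = b0 + b7; d[7] = b0 - b7;
+    d[1] = b2 + b5; d[6] = b2 - b5;
+    d[2] = b4 + b3; d[5] = b4 - b3;
+    d[3] = b6 + b1; d[4] = b6 - b1;
+}
+#endif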
+
+static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ int i;
+ int16_t __attribute__ ((aligned(8))) b2[64];
+
+ block[0] += 32;
+
+ for(i=0; i<2; i++){
+ DECLARE_ALIGNED_8(uint64_t, tmp);
+
+ h264_idct8_1d(block+4*i);
+
+ __asm__ volatile(
+ "movq %%mm7, %0 \n\t"
+ TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
+ "movq %%mm0, 8(%1) \n\t"
+ "movq %%mm6, 24(%1) \n\t"
+ "movq %%mm7, 40(%1) \n\t"
+ "movq %%mm4, 56(%1) \n\t"
+ "movq %0, %%mm7 \n\t"
+ TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
+ "movq %%mm7, (%1) \n\t"
+ "movq %%mm1, 16(%1) \n\t"
+ "movq %%mm0, 32(%1) \n\t"
+ "movq %%mm3, 48(%1) \n\t"
+ : "=m"(tmp)
+ : "r"(b2+32*i)
+ : "memory"
+ );
+ }
+
+ for(i=0; i<2; i++){
+ h264_idct8_1d(b2+4*i);
+
+ __asm__ volatile(
+ "psraw $6, %%mm7 \n\t"
+ "psraw $6, %%mm6 \n\t"
+ "psraw $6, %%mm5 \n\t"
+ "psraw $6, %%mm4 \n\t"
+ "psraw $6, %%mm3 \n\t"
+ "psraw $6, %%mm2 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+
+ "movq %%mm7, (%0) \n\t"
+ "movq %%mm5, 16(%0) \n\t"
+ "movq %%mm3, 32(%0) \n\t"
+ "movq %%mm1, 48(%0) \n\t"
+ "movq %%mm0, 64(%0) \n\t"
+ "movq %%mm2, 80(%0) \n\t"
+ "movq %%mm4, 96(%0) \n\t"
+ "movq %%mm6, 112(%0) \n\t"
+ :: "r"(b2+4*i)
+ : "memory"
+ );
+ }
+
+ add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+#define STORE_DIFF_8P( p, d, t, z )\
+ "movq "#d", "#t" \n"\
+ "psraw $6, "#p" \n"\
+ "punpcklbw "#z", "#t" \n"\
+ "paddsw "#t", "#p" \n"\
+ "packuswb "#p", "#p" \n"\
+ "movq "#p", "#d" \n"
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
+ "movdqa "#c", "#a" \n"\
+ "movdqa "#g", "#e" \n"\
+ "psraw $1, "#c" \n"\
+ "psraw $1, "#g" \n"\
+ "psubw "#e", "#c" \n"\
+ "paddw "#a", "#g" \n"\
+ "movdqa "#b", "#e" \n"\
+ "psraw $1, "#e" \n"\
+ "paddw "#b", "#e" \n"\
+ "paddw "#d", "#e" \n"\
+ "paddw "#f", "#e" \n"\
+ "movdqa "#f", "#a" \n"\
+ "psraw $1, "#a" \n"\
+ "paddw "#f", "#a" \n"\
+ "paddw "#h", "#a" \n"\
+ "psubw "#b", "#a" \n"\
+ "psubw "#d", "#b" \n"\
+ "psubw "#d", "#f" \n"\
+ "paddw "#h", "#b" \n"\
+ "psubw "#h", "#f" \n"\
+ "psraw $1, "#d" \n"\
+ "psraw $1, "#h" \n"\
+ "psubw "#d", "#b" \n"\
+ "psubw "#h", "#f" \n"\
+ "movdqa "#e", "#d" \n"\
+ "movdqa "#a", "#h" \n"\
+ "psraw $2, "#d" \n"\
+ "psraw $2, "#h" \n"\
+ "paddw "#f", "#d" \n"\
+ "paddw "#b", "#h" \n"\
+ "psraw $2, "#f" \n"\
+ "psraw $2, "#b" \n"\
+ "psubw "#f", "#e" \n"\
+ "psubw "#a", "#b" \n"\
+ "movdqa 0x00(%1), "#a" \n"\
+ "movdqa 0x40(%1), "#f" \n"\
+ SUMSUB_BA(f, a)\
+ SUMSUB_BA(g, f)\
+ SUMSUB_BA(c, a)\
+ SUMSUB_BA(e, g)\
+ SUMSUB_BA(b, c)\
+ SUMSUB_BA(h, a)\
+ SUMSUB_BA(d, f)
+
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+ __asm__ volatile(
+ "movdqa 0x10(%1), %%xmm1 \n"
+ "movdqa 0x20(%1), %%xmm2 \n"
+ "movdqa 0x30(%1), %%xmm3 \n"
+ "movdqa 0x50(%1), %%xmm5 \n"
+ "movdqa 0x60(%1), %%xmm6 \n"
+ "movdqa 0x70(%1), %%xmm7 \n"
+ H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+ TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+ "paddw %4, %%xmm4 \n"
+ "movdqa %%xmm4, 0x00(%1) \n"
+ "movdqa %%xmm2, 0x40(%1) \n"
+ H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+ "movdqa %%xmm6, 0x60(%1) \n"
+ "movdqa %%xmm7, 0x70(%1) \n"
+ "pxor %%xmm7, %%xmm7 \n"
+ STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
+ "lea (%0,%2,4), %0 \n"
+ STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
+ "movdqa 0x60(%1), %%xmm0 \n"
+ "movdqa 0x70(%1), %%xmm1 \n"
+ STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+ STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
+ :"+r"(dst)
+ :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
+ );
+}
+
+static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ int dc = (block[0] + 32) >> 6;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dst+0*stride)),
+ "+m"(*(uint32_t*)(dst+1*stride)),
+ "+m"(*(uint32_t*)(dst+2*stride)),
+ "+m"(*(uint32_t*)(dst+3*stride))
+ );
+}
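+
+/* Scalar sketch of the DC-only add above (illustrative only).  The MMX
+ * code splits dc into a saturating add of max(dc,0) (mm0) and a
+ * saturating subtract of max(-dc,0) (mm1), which clips to [0,255]
+ * without unpacking bytes to words: */
+#if 0
+static void idct_dc_add_ref(uint8_t *dst, int16_t *block, int stride)
+{
+    int i, j;
+    const int dc = (block[0] + 32) >> 6;
+    for (j = 0; j < 4; j++, dst += stride)
+        for (i = 0; i < 4; i++)
+            dst[i] = av_clip_uint8(dst[i] + dc);
+}
+#endif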
+
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ int dc = (block[0] + 32) >> 6;
+ int y;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ for(y=2; y--; dst += 4*stride){
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint64_t*)(dst+0*stride)),
+ "+m"(*(uint64_t*)(dst+1*stride)),
+ "+m"(*(uint64_t*)(dst+2*stride)),
+ "+m"(*(uint64_t*)(dst+3*stride))
+ );
+ }
+}
+
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
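+
+/* scan8[i] is the position of block i inside an 8-wide non_zero_count
+ * cache; the row above and the column to the left of each entry hold
+ * values from the neighbouring macroblocks, so the left and top
+ * neighbours of block i are at scan8[i]-1 and scan8[i]-8. */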
+
+static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+
+static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
+/***********************************/
+/* deblocking */
+
+// out: o = |x-y|>a
+// clobbers: t
+#define DIFF_GT_MMX(x,y,a,o,t)\
+ "movq "#y", "#t" \n\t"\
+ "movq "#x", "#o" \n\t"\
+ "psubusb "#x", "#t" \n\t"\
+ "psubusb "#y", "#o" \n\t"\
+ "por "#t", "#o" \n\t"\
+ "psubusb "#a", "#o" \n\t"
+
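+// Per-byte sketch (illustrative), relying on unsigned saturation:
+//     o = sat(x-y) | sat(y-x)   == |x-y|
+//     o = sat(o - a)            -> nonzero iff |x-y| > a
+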
+// out: o = |x-y|>a
+// clobbers: t
+#define DIFF_GT2_MMX(x,y,a,o,t)\
+ "movq "#y", "#t" \n\t"\
+ "movq "#x", "#o" \n\t"\
+ "psubusb "#x", "#t" \n\t"\
+ "psubusb "#y", "#o" \n\t"\
+ "psubusb "#a", "#t" \n\t"\
+ "psubusb "#a", "#o" \n\t"\
+ "pcmpeqb "#t", "#o" \n\t"\
+
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
+// out: mm5=beta-1, mm7=mask
+// clobbers: mm4,mm6
+#define H264_DEBLOCK_MASK(alpha1, beta1) \
+ "pshufw $0, "#alpha1", %%mm4 \n\t"\
+ "pshufw $0, "#beta1 ", %%mm5 \n\t"\
+ "packuswb %%mm4, %%mm4 \n\t"\
+ "packuswb %%mm5, %%mm5 \n\t"\
+ DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
+ DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
+ "por %%mm4, %%mm7 \n\t"\
+ DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
+ "por %%mm4, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpeqb %%mm6, %%mm7 \n\t"
+
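+// Per-byte sketch of the resulting mask (alpha1 = alpha-1, beta1 = beta-1):
+//     mask = (|p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta) ? 0xFF : 0
+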
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
+// out: mm1=p0' mm2=q0'
+// clobbers: mm0,3-6
+#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
+ "movq %%mm1 , %%mm5 \n\t"\
+ "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
+ "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
+ "pcmpeqb %%mm4 , %%mm4 \n\t"\
+ "pxor %%mm4 , %%mm3 \n\t"\
+ "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
+ "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
+ "pxor %%mm1 , %%mm4 \n\t"\
+ "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
+ "pavgb %%mm5 , %%mm3 \n\t"\
+ "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
+ "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
+ "psubusb %%mm3 , %%mm6 \n\t"\
+ "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
+ "pminub %%mm7 , %%mm6 \n\t"\
+ "pminub %%mm7 , %%mm3 \n\t"\
+ "psubusb %%mm6 , %%mm1 \n\t"\
+ "psubusb %%mm3 , %%mm2 \n\t"\
+ "paddusb %%mm3 , %%mm1 \n\t"\
+ "paddusb %%mm6 , %%mm2 \n\t"
+
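+/* The update being computed, in scalar form (a sketch; tc here is the
+ * already-masked tc value in mm7):
+ *     delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
+ *     p0'   = av_clip_uint8(p0 + delta);
+ *     q0'   = av_clip_uint8(q0 - delta);
+ */
+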
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
+// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+// clobbers: q2, tmp, tc0
+#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
+ "movq %%mm1, "#tmp" \n\t"\
+ "pavgb %%mm2, "#tmp" \n\t"\
+ "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
+ "pxor "q2addr", "#tmp" \n\t"\
+ "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
+ "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
+ "movq "#p1", "#tmp" \n\t"\
+ "psubusb "#tc0", "#tmp" \n\t"\
+ "paddusb "#p1", "#tc0" \n\t"\
+ "pmaxub "#tmp", "#q2" \n\t"\
+ "pminub "#tc0", "#q2" \n\t"\
+ "movq "#q2", "q1addr" \n\t"
+
+static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{
+ DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
+
+ __asm__ volatile(
+ "movq (%1,%3), %%mm0 \n\t" //p1
+ "movq (%1,%3,2), %%mm1 \n\t" //p0
+ "movq (%2), %%mm2 \n\t" //q0
+ "movq (%2,%3), %%mm3 \n\t" //q1
+ H264_DEBLOCK_MASK(%6, %7)
+
+ "movd %5, %%mm4 \n\t"
+ "punpcklbw %%mm4, %%mm4 \n\t"
+ "punpcklwd %%mm4, %%mm4 \n\t"
+ "pcmpeqb %%mm3, %%mm3 \n\t"
+ "movq %%mm4, %%mm6 \n\t"
+ "pcmpgtb %%mm3, %%mm4 \n\t"
+ "movq %%mm6, 8+%0 \n\t"
+ "pand %%mm4, %%mm7 \n\t"
+ "movq %%mm7, %0 \n\t"
+
+ /* filter p1 */
+ "movq (%1), %%mm3 \n\t" //p2
+ DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
+ "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
+ "pand 8+%0, %%mm7 \n\t" // mask & tc0
+ "movq %%mm7, %%mm4 \n\t"
+ "psubb %%mm6, %%mm7 \n\t"
+ "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
+ H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
+
+ /* filter q1 */
+ "movq (%2,%3,2), %%mm4 \n\t" //q2
+ DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
+ "pand %0, %%mm6 \n\t"
+        "movq     8+%0,  %%mm5      \n\t" // could be merged with the and below, but that is slower
+ "pand %%mm6, %%mm5 \n\t"
+ "psubb %%mm6, %%mm7 \n\t"
+ "movq (%2,%3), %%mm3 \n\t"
+ H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
+
+ /* filter p0, q0 */
+ H264_DEBLOCK_P0_Q0(%8, unused)
+ "movq %%mm1, (%1,%3,2) \n\t"
+ "movq %%mm2, (%2) \n\t"
+
+ : "=m"(*tmp0)
+ : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
+ "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
+ "m"(ff_bone)
+ );
+}
+
+static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ if((tc0[0] & tc0[1]) >= 0)
+ h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+ if((tc0[2] & tc0[3]) >= 0)
+ h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
+}
+static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ // also, it only needs to transpose 6x8
+ DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
+ int i;
+ for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
+ if((tc0[0] & tc0[1]) < 0)
+ continue;
+ transpose4x4(trans, pix-4, 8, stride);
+ transpose4x4(trans +4*8, pix, 8, stride);
+ transpose4x4(trans+4, pix-4+4*stride, 8, stride);
+ transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
+ h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
+ transpose4x4(pix-2, trans +2*8, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
+ }
+}
+
+static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
+{
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t" //p1
+ "movq (%0,%2), %%mm1 \n\t" //p0
+ "movq (%1), %%mm2 \n\t" //q0
+ "movq (%1,%2), %%mm3 \n\t" //q1
+ H264_DEBLOCK_MASK(%4, %5)
+ "movd %3, %%mm6 \n\t"
+ "punpcklbw %%mm6, %%mm6 \n\t"
+ "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
+ H264_DEBLOCK_P0_Q0(%6, %7)
+ "movq %%mm1, (%0,%2) \n\t"
+ "movq %%mm2, (%1) \n\t"
+
+ :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+ "r"(*(uint32_t*)tc0),
+ "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
+ );
+}
+
+static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
+}
+
+static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
+ transpose4x4(trans, pix-2, 8, stride);
+ transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+ h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
+ transpose4x4(pix-2, trans, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+// p0 = (p0 + q1 + 2*p1 + 2) >> 2
+#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
+ "movq "#p0", %%mm4 \n\t"\
+ "pxor "#q1", %%mm4 \n\t"\
+ "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
+ "pavgb "#q1", "#p0" \n\t"\
+ "psubusb %%mm4, "#p0" \n\t"\
+ "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
+
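+// Per-byte sketch (illustrative): pavgb(a,b) = (a+b+1)>>1, and subtracting
+// (a^b)&1 turns that into the exact floor (a+b)>>1, so the macro computes
+//     dst = (p1 + ((p0 + q1) >> 1) + 1) >> 1  ==  (p0 + q1 + 2*p1 + 2) >> 2
+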
+static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
+{
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%0,%2), %%mm1 \n\t"
+ "movq (%1), %%mm2 \n\t"
+ "movq (%1,%2), %%mm3 \n\t"
+ H264_DEBLOCK_MASK(%3, %4)
+ "movq %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %%mm6 \n\t"
+ H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
+ H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
+ "psubb %%mm5, %%mm1 \n\t"
+ "psubb %%mm6, %%mm2 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "paddb %%mm5, %%mm1 \n\t"
+ "paddb %%mm6, %%mm2 \n\t"
+ "movq %%mm1, (%0,%2) \n\t"
+ "movq %%mm2, (%1) \n\t"
+ :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
+ "m"(alpha1), "m"(beta1), "m"(ff_bone)
+ );
+}
+
+static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+ h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
+}
+
+static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
+{
+ //FIXME: could cut some load/stores by merging transpose with filter
+ DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
+ transpose4x4(trans, pix-2, 8, stride);
+ transpose4x4(trans+4, pix-2+4*stride, 8, stride);
+ h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
+ transpose4x4(pix-2, trans, stride, 8);
+ transpose4x4(pix-2+4*stride, trans+4, stride, 8);
+}
+
+static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
+ int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
+ int dir;
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq %0, %%mm6 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "movq %2, %%mm4 \n\t"
+ ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
+ );
+ if(field)
+ __asm__ volatile(
+ "movq %0, %%mm5 \n\t"
+ "movq %1, %%mm4 \n\t"
+ ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
+ );
+
+ // could do a special case for dir==0 && edges==1, but it only reduces the
+ // average filter time by 1.2%
+ for( dir=1; dir>=0; dir-- ) {
+ const int d_idx = dir ? -8 : -1;
+ const int mask_mv = dir ? mask_mv1 : mask_mv0;
+ DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
+ int b_idx, edge, l;
+ for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
+ __asm__ volatile(
+ "pand %0, %%mm0 \n\t"
+ ::"m"(mask_dir)
+ );
+ if(!(mask_mv & edge)) {
+ __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
+ for( l = bidir; l >= 0; l-- ) {
+ __asm__ volatile(
+ "movd %0, %%mm1 \n\t"
+ "punpckldq %1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psrlw $7, %%mm2 \n\t"
+ "pand %%mm6, %%mm2 \n\t"
+ "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
+ "punpckldq %%mm1, %%mm2 \n\t"
+ "pcmpeqb %%mm2, %%mm1 \n\t"
+ "paddb %%mm6, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
+ "por %%mm1, %%mm0 \n\t"
+
+ "movq %2, %%mm1 \n\t"
+ "movq %3, %%mm2 \n\t"
+ "psubw %4, %%mm1 \n\t"
+ "psubw %5, %%mm2 \n\t"
+ "packsswb %%mm2, %%mm1 \n\t"
+ "paddb %%mm5, %%mm1 \n\t"
+ "pminub %%mm4, %%mm1 \n\t"
+ "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
+ "por %%mm1, %%mm0 \n\t"
+ ::"m"(ref[l][b_idx]),
+ "m"(ref[l][b_idx+d_idx]),
+ "m"(mv[l][b_idx][0]),
+ "m"(mv[l][b_idx+2][0]),
+ "m"(mv[l][b_idx+d_idx][0]),
+ "m"(mv[l][b_idx+d_idx+2][0])
+ );
+ }
+ }
+ __asm__ volatile(
+ "movd %0, %%mm1 \n\t"
+ "por %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
+ ::"m"(nnz[b_idx]),
+ "m"(nnz[b_idx+d_idx])
+ );
+ __asm__ volatile(
+        "pcmpeqw %%mm7, %%mm0           \n\t"
+        "pcmpeqw %%mm7, %%mm0           \n\t" // intentionally applied twice: mm0 = (word != 0) ? 0xffff : 0
+ "psrlw $15, %%mm0 \n\t" // nonzero -> 1
+ "psrlw $14, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "por %%mm1, %%mm2 \n\t"
+ "psrlw $1, %%mm1 \n\t"
+ "pandn %%mm2, %%mm1 \n\t"
+ "movq %%mm1, %0 \n\t"
+ :"=m"(*bS[dir][edge])
+ ::"memory"
+ );
+ }
+ edges = 4;
+ step = 1;
+ }
+ __asm__ volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, 16(%0) \n\t"
+ "movq %%mm2, 24(%0) \n\t"
+ ::"r"(bS[0])
+ :"memory"
+ );
+}
+
+/***********************************/
+/* motion compensation */
+
+#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %4, "#T" \n\t"\
+ "paddw %5, "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "psraw $5, "#T" \n\t"\
+ "packuswb "#T", "#T" \n\t"\
+ OP(T, (%1), A, d)\
+ "add %3, %1 \n\t"
+
+#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "paddw %4, "#A" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %3, "#T" \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "mov"#q" "#T", "#OF"(%1) \n\t"
+
+#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
+#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
+#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
+#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
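+
+/* All QPEL_H264* variants evaluate the six-tap lowpass
+ *     (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5
+ * factored so the intermediates stay within 16 bits; the HV variants
+ * store the unshifted (but already rounded) word sums to a tmp buffer
+ * for a second pass instead of packing.  An illustrative scalar sketch: */
+#if 0
+static int qpel6tap_ref(int A, int B, int C, int D, int E, int F)
+{
+    return av_clip_uint8((((C + D) * 4 - B - E) * 5 + A + F + 16) >> 5);
+}
+#endif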
+
+
+#define QPEL_H264(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=4;\
+\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm4 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "1: \n\t"\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=4;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm4 \n\t"\
+ "movq %1, %%mm5 \n\t"\
+ :: "m"(ff_pw_5), "m"(ff_pw_16)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "movd (%2), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ PAVGB" %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ src -= 2*srcStride;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ int h=4;\
+ int w=3;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
+ \
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ tmp += 4;\
+ src += 4 - 9*srcStride;\
+ }\
+ tmp -= 3*4;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "paddw 10(%0), %%mm0 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "paddw 8(%0), %%mm1 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
+ "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
+ "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
+ "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
+ "psraw $6, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, d)\
+ "add $24, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %5, %%mm6 \n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq %6, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq %5, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "movq (%2), %%mm4 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ PAVGB" %%mm4, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
+ int w = (size+8)>>2;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(size==16){\
+ __asm__ volatile(\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ tmp += 4;\
+ src += 4 - (size+5)*srcStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int w = size>>4;\
+ do{\
+ int h = size;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm3 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "movq 10(%0), %%mm4 \n\t"\
+ "paddw %%mm4, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw 18(%0), %%mm3 \n\t"\
+ "paddw 16(%0), %%mm4 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "movq 12(%0), %%mm5 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "paddw 14(%0), %%mm5 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "paddsw %%mm5, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm3 \n\t"\
+ "psraw $6, %%mm0 \n\t"\
+ "psraw $6, %%mm3 \n\t"\
+ "packuswb %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ tmp += 8 - size*24;\
+ dst += 8 - size*dstStride;\
+ }while(w--);\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 24(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ "lea (%0,%3,2), %0 \n\t"\
+ "lea (%2,%4,2), %2 \n\t"\
+ "movq 48(%1), %%mm0 \n\t"\
+ "movq 72(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ :"+a"(src8), "+c"(src16), "+d"(dst)\
+ :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
+ :"memory");\
+}\
+static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ do{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 8(%1), %%mm1 \n\t"\
+ "movq 48(%1), %%mm2 \n\t"\
+ "movq 8+48(%1), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "psraw $5, %%mm2 \n\t"\
+ "psraw $5, %%mm3 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ "packuswb %%mm3, %%mm2 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm2 \n\t"\
+ OP(%%mm0, (%2), %%mm5, q)\
+ OP(%%mm2, (%2,%4), %%mm5, q)\
+ ::"a"(src8), "c"(src16), "d"(dst),\
+ "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
+ :"memory");\
+ src8 += 2L*src8Stride;\
+ src16 += 48;\
+ dst += 2L*dstStride;\
+ }while(h-=2);\
+}\
+static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
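+/* Scalar sketch of the pixels*_l2_shift5 helpers above (illustrative
+ * only, put_ case): tmp holds word sums that already include the +16
+ * rounding, so each output is the 5-bit downshift, clipped, then
+ * averaged with the second prediction: */
+#if 0
+static void pixels_l2_shift5_ref(uint8_t *dst, const int16_t *src16,
+                                 const uint8_t *src8, int n)
+{
+    int i;
+    for (i = 0; i < n; i++)
+        dst[i] = (av_clip_uint8(src16[i] >> 5) + src8[i] + 1) >> 1;
+}
+#endif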
+
+#ifdef ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=16;\
+ __asm__ volatile(\
+ "pxor %%xmm15, %%xmm15 \n\t"\
+ "movdqa %6, %%xmm14 \n\t"\
+ "movdqa %7, %%xmm13 \n\t"\
+ "1: \n\t"\
+ "lddqu 3(%0), %%xmm1 \n\t"\
+ "lddqu -5(%0), %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm15, %%xmm1 \n\t"\
+ "punpcklbw %%xmm15, %%xmm0 \n\t"\
+ "punpcklbw %%xmm15, %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm0, %%xmm6 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm0, %%xmm8 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm0, %%xmm9 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "movdqa %%xmm0, %%xmm10 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $6, %%xmm7, %%xmm10\n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $8, %%xmm7, %%xmm9 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "palignr $10,%%xmm7, %%xmm8 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "paddw %%xmm0, %%xmm10 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $12,%%xmm7, %%xmm6 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "palignr $14,%%xmm7, %%xmm0 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm8, %%xmm6 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm9, %%xmm0 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psllw $2, %%xmm6 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "psubw %%xmm0, %%xmm6 \n\t"\
+ "paddw %%xmm13,%%xmm5 \n\t"\
+ "paddw %%xmm13,%%xmm10 \n\t"\
+ "pmullw %%xmm14,%%xmm2 \n\t"\
+ "pmullw %%xmm14,%%xmm6 \n\t"\
+ "lddqu (%2), %%xmm3 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "paddw %%xmm10,%%xmm6 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm6 \n\t"\
+ "packuswb %%xmm2,%%xmm6 \n\t"\
+ "pavgb %%xmm3, %%xmm6 \n\t"\
+ OP(%%xmm6, (%1), %%xmm4, dqa)\
+ "add %5, %0 \n\t"\
+ "add %5, %1 \n\t"\
+ "add %4, %2 \n\t"\
+ "decl %3 \n\t"\
+ "jg 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
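+
+/* The single-loop 16-wide l2 horizontal lowpass above keeps the whole
+ * filter window in registers, which requires xmm8-xmm15 and is therefore
+ * x86_64-only; 32-bit builds compose the 16-wide case from two 8-wide
+ * calls instead. */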
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa %0, %%xmm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "lddqu -5(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "movq (%2), %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw %5, %%xmm5 \n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ "pavgb %%xmm3, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa %5, %%xmm6 \n\t"\
+ "1: \n\t"\
+ "lddqu -5(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $6, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $10,%%xmm0, %%xmm3 \n\t"\
+ "paddw %%xmm1, %%xmm5 \n\t"\
+ "palignr $12,%%xmm0, %%xmm2 \n\t"\
+ "palignr $14,%%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw %6, %%xmm5 \n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ src -= 2*srcStride;\
+ \
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movq (%0), %%xmm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm2 \n\t"\
+ "punpcklbw %%xmm7, %%xmm3 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+}\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
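+
+/* In qpel8or16_v_lowpass above, the six xmm row registers form a sliding
+ * window: five rows are preloaded, then each QPEL_H264V_XMM invocation
+ * rotates the register arguments, loads one new source row and emits one
+ * filtered row, so the vertical 6-tap filter never reloads a row. */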
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
+ int w = (size+8)>>3;
+ src -= 2*srcStride+2;
+ while(w--){
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm2 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm3 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm4 \n\t"
+ "add %2, %0 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm4 \n\t"
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ if(size==16){
+ __asm__ volatile(
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ }
+ tmp += 8;
+ src += 8 - (size+5)*srcStride;
+ }
+}
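+
+/* Note on the layout of tmp: put_h264_qpel8or16_hv1_lowpass_sse2() writes
+ * the vertically filtered intermediate with a fixed row stride of 48 bytes
+ * (24 int16_t). The horizontal pass needs size+5 columns of this data;
+ * they are produced eight at a time ((size+8)>>3 outer-loop iterations),
+ * starting two columns to the left (src is rewound by 2*srcStride+2). */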
+
+#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int h = size;\
+ if(size == 16){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 32(%0), %%xmm4 \n\t"\
+ "movdqa 16(%0), %%xmm5 \n\t"\
+ "movdqa (%0), %%xmm7 \n\t"\
+ "movdqa %%xmm4, %%xmm3 \n\t"\
+ "movdqa %%xmm4, %%xmm2 \n\t"\
+ "movdqa %%xmm4, %%xmm1 \n\t"\
+ "movdqa %%xmm4, %%xmm0 \n\t"\
+ "palignr $10, %%xmm5, %%xmm0 \n\t"\
+ "palignr $8, %%xmm5, %%xmm1 \n\t"\
+ "palignr $6, %%xmm5, %%xmm2 \n\t"\
+ "palignr $4, %%xmm5, %%xmm3 \n\t"\
+ "palignr $2, %%xmm5, %%xmm4 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "movdqa %%xmm5, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm3 \n\t"\
+ "palignr $8, %%xmm7, %%xmm4 \n\t"\
+ "palignr $2, %%xmm7, %%xmm6 \n\t"\
+ "palignr $10, %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "palignr $6, %%xmm7, %%xmm5 \n\t"\
+ "palignr $4, %%xmm7, %%xmm6 \n\t"\
+ "paddw %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm5 \n\t"\
+ \
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "psraw $6, %%xmm3 \n\t"\
+ "packuswb %%xmm0, %%xmm3 \n\t"\
+ OP(%%xmm3, (%1), %%xmm7, dqa)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }else{\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 16(%0), %%xmm1 \n\t"\
+ "movdqa (%0), %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $10, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $6, %%xmm0, %%xmm3 \n\t"\
+ "palignr $4, %%xmm0, %%xmm2 \n\t"\
+ "palignr $2, %%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "packuswb %%xmm0, %%xmm0 \n\t"\
+ OP(%%xmm0, (%1), %%xmm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }\
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
+#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
+
+#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
+#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
+
+#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
+#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
+#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
+#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
+
+#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
+#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
+#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
+#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
+
+#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
+#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, int16_t, temp[SIZE*(SIZE<8?12:24)]);\
+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
+}\
+
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+#define PAVGB "pavgusb"
+QPEL_H264(put_, PUT_OP, 3dnow)
+QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
+#undef PAVGB
+#define PAVGB "pavgb"
+QPEL_H264(put_, PUT_OP, mmx2)
+QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
+QPEL_H264_V_XMM(put_, PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
+#ifdef HAVE_SSSE3
+QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
+#endif
+#undef PAVGB
+
+H264_MC_4816(3dnow)
+H264_MC_4816(mmx2)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+#ifdef HAVE_SSSE3
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+#endif
+
+
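+/* The chroma MC template is included several times below; each inclusion
+ * first defines H264_CHROMA_OP, H264_CHROMA_OP4 and the _TMPL names, so
+ * one source file instantiates the put and avg variants for MMX, MMXEXT
+ * and 3DNow!. An empty H264_CHROMA_OP turns the put_ versions into plain
+ * stores. */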
+#define H264_CHROMA_OP(S,D)
+#define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
+#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
+#include "dsputil_h264_template_mmx.c"
+
+static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
+}
+static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
+}
+
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
+ "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
+#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
+#include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
+}
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC2_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
+#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
+ "pavgusb " #T ", " #D " \n\t"
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
+#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
+#include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
+}
+#undef H264_CHROMA_OP
+#undef H264_CHROMA_OP4
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+
+#ifdef HAVE_SSSE3
+#define AVG_OP(X)
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
+#include "dsputil_h264_template_ssse3.c"
+static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
+}
+static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
+}
+
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#define AVG_OP(X) X
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
+#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
+#include "dsputil_h264_template_ssse3.c"
+static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+ avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
+}
+#undef AVG_OP
+#undef H264_CHROMA_MC8_TMPL
+#undef H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#endif
+
+/***********************************/
+/* weighted prediction */
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+ int x, y;
+ offset <<= log2_denom;
+ offset += (1 << log2_denom) >> 1;
+ __asm__ volatile(
+ "movd %0, %%mm4 \n\t"
+ "movd %1, %%mm5 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weight), "g"(offset), "g"(log2_denom)
+ );
+ for(y=0; y<h; y+=2){
+ for(x=0; x<w; x+=4){
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm1 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "psraw %%mm6, %%mm1 \n\t"
+ "packuswb %%mm7, %%mm0 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "movd %%mm1, %1 \n\t"
+ : "+m"(*(uint32_t*)(dst+x)),
+ "+m"(*(uint32_t*)(dst+x+stride))
+ );
+ }
+ dst += 2*stride;
+ }
+}
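+
+/* For reference, a scalar sketch of what the weight kernel above computes
+ * per pixel (illustrative only; av_clip_uint8() stands in for the
+ * packuswb saturation):
+ *
+ *     // offset was pre-scaled at function entry:
+ *     //   offset = (offset << log2_denom) + ((1 << log2_denom) >> 1);
+ *     dst[x] = av_clip_uint8((dst[x] * weight + offset) >> log2_denom);
+ */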
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
+{
+ int x, y;
+ offset = ((offset + 1) | 1) << log2_denom;
+ __asm__ volatile(
+ "movd %0, %%mm3 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "movd %2, %%mm5 \n\t"
+ "movd %3, %%mm6 \n\t"
+ "pshufw $0, %%mm3, %%mm3 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+ );
+ for(y=0; y<h; y++){
+ for(x=0; x<w; x+=4){
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddsw %%mm1, %%mm0 \n\t"
+ "paddsw %%mm5, %%mm0 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+ : "+m"(*(uint32_t*)(dst+x))
+ : "m"(*(uint32_t*)(src+x))
+ );
+ }
+ src += stride;
+ dst += stride;
+ }
+}
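+
+/* Likewise, a scalar sketch of the biweight kernel above (illustrative
+ * only; the pre-scaled offset = ((offset + 1) | 1) << log2_denom is
+ * computed at function entry):
+ *
+ *     dst[x] = av_clip_uint8((dst[x] * weightd + src[x] * weights + offset)
+ *                            >> (log2_denom + 1));
+ */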
+
+#define H264_WEIGHT(W,H) \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+ ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+ ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
+
diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c
new file mode 100644
index 0000000000..aed934c04d
--- /dev/null
+++ b/libavcodec/x86/idct_mmx.c
@@ -0,0 +1,605 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with mpeg2dec; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/dsputil.h"
+
+#include "mmx.h"
+
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
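+
+/* Example: rounder0 below uses bias = (1 << (COL_SHIFT-1)) - 0.5 = 31.5,
+ * so round(31.5) = (int)(32.0 * (1 << ROW_SHIFT)) = 65536. This folds the
+ * 0.5 rounding term of the column pass into the row pass, pre-scaled by
+ * 1 << ROW_SHIFT. */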
+
+
+#if 0
+/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+ int16_t * table, int32_t * rounder)
+{
+ int C1, C2, C3, C4, C5, C6, C7;
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+
+ row += offset;
+
+ C1 = table[1];
+ C2 = table[2];
+ C3 = table[3];
+ C4 = table[4];
+ C5 = table[5];
+ C6 = table[6];
+ C7 = table[7];
+
+ a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+ a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+ a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+ a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+ b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+ b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+ b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+ b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+ row[0] = (a0 + b0) >> ROW_SHIFT;
+ row[1] = (a1 + b1) >> ROW_SHIFT;
+ row[2] = (a2 + b2) >> ROW_SHIFT;
+ row[3] = (a3 + b3) >> ROW_SHIFT;
+ row[4] = (a3 - b3) >> ROW_SHIFT;
+ row[5] = (a2 - b2) >> ROW_SHIFT;
+ row[6] = (a1 - b1) >> ROW_SHIFT;
+ row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
+ c4, c6, c4, c6, \
+ c1, c3, -c1, -c5, \
+ c5, c7, c3, -c7, \
+ c4, -c6, c4, -c6, \
+ -c4, c2, c4, -c2, \
+ c5, -c1, c3, -c1, \
+ c7, c3, c7, -c5 }
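+
+/* The table interleaves the coefficients so that each pmaddwd over a word
+ * pair of even inputs (x0,x2,x4,x6) or odd inputs (x1,x3,x5,x7) produces
+ * two of the partial sums a0..a3 / b0..b3 of the reference idct_row()
+ * above in a single instruction. */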
+
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+ const int16_t * const table)
+{
+ movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
+
+ movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
+ movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
+
+ movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
+ movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
+
+ movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
+ pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+ pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */
+}
+
+static inline void mmxext_row (const int16_t * const table,
+ const int32_t * const rounder)
+{
+ movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */
+ pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
+
+ pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+ pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */
+
+ movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */
+ pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
+
+ paddd_m2r (*rounder, mm3); /* mm3 += rounder */
+ pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
+
+ pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
+ paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
+
+ pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+ movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
+
+ pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+ paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
+
+ paddd_m2r (*rounder, mm0); /* mm0 += rounder */
+ psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
+
+ psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
+ paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
+
+ paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
+ psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
+
+ paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
+ movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */
+
+ paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
+ psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmxext_row_tail (int16_t * const row, const int store)
+{
+ psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
+
+ psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
+
+ packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
+
+ packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
+
+ movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
+ pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */
+
+ /* slot */
+
+ movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
+}
+
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+ const int offset,
+ const int16_t * const table)
+{
+ movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
+ psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
+
+ movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
+ psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
+
+ packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
+ movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
+
+ packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
+ movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
+
+ movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
+ pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */
+
+ movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
+ movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
+
+ pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+ movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
+ pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */
+}
+
+
+/* MMX row IDCT */
+
+#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
+ c4, c6, -c4, -c2, \
+ c1, c3, c3, -c7, \
+ c5, c7, -c1, -c5, \
+ c4, -c6, c4, -c2, \
+ -c4, c2, c4, -c6, \
+ c5, -c1, c7, -c5, \
+ c7, c3, c3, -c1 }
+
+static inline void mmx_row_head (int16_t * const row, const int offset,
+ const int16_t * const table)
+{
+ movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
+
+ movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
+ movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
+
+ movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
+ movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
+
+ punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
+
+ movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
+ pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+
+ movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
+ punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
+}
+
+static inline void mmx_row (const int16_t * const table,
+ const int32_t * const rounder)
+{
+ pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+ punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */
+
+ pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+ punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */
+
+ movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */
+ pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
+
+ paddd_m2r (*rounder, mm3); /* mm3 += rounder */
+ pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
+
+ pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+ paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
+
+ pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+ movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
+
+ pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+ paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
+
+ paddd_m2r (*rounder, mm0); /* mm0 += rounder */
+ psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
+
+ psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
+ paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
+
+ paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
+ psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
+
+ paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
+ movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */
+
+ paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
+ psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */
+}
+
+static inline void mmx_row_tail (int16_t * const row, const int store)
+{
+ psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
+
+ psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
+
+ packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
+
+ packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
+
+ movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
+ movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */
+
+ pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */
+
+ psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */
+
+ por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */
+
+ /* slot */
+
+ movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
+}
+
+static inline void mmx_row_mid (int16_t * const row, const int store,
+ const int offset, const int16_t * const table)
+{
+ movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
+ psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
+
+ movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
+ psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
+
+ packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
+ movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
+
+ packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
+ movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
+
+ movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
+ movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */
+
+ punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
+ psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */
+
+ movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
+ pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */
+
+ movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
+ por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */
+
+ movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
+ punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
+
+ movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
+ pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
+}
+
+
+#if 0
+/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_col (int16_t * col, int offset)
+{
+/* multiplication - as implemented on mmx */
+#define F(c,x) (((c) * (x)) >> 16)
+
+/* saturation - it helps us handle torture test cases */
+#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
+
+ int16_t x0, x1, x2, x3, x4, x5, x6, x7;
+ int16_t y0, y1, y2, y3, y4, y5, y6, y7;
+ int16_t a0, a1, a2, a3, b0, b1, b2, b3;
+ int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
+
+ col += offset;
+
+ x0 = col[0*8];
+ x1 = col[1*8];
+ x2 = col[2*8];
+ x3 = col[3*8];
+ x4 = col[4*8];
+ x5 = col[5*8];
+ x6 = col[6*8];
+ x7 = col[7*8];
+
+ u04 = S (x0 + x4);
+ v04 = S (x0 - x4);
+ u26 = S (F (T2, x6) + x2);
+ v26 = S (F (T2, x2) - x6);
+
+ a0 = S (u04 + u26);
+ a1 = S (v04 + v26);
+ a2 = S (v04 - v26);
+ a3 = S (u04 - u26);
+
+ u17 = S (F (T1, x7) + x1);
+ v17 = S (F (T1, x1) - x7);
+ u35 = S (F (T3, x5) + x3);
+ v35 = S (F (T3, x3) - x5);
+
+ b0 = S (u17 + u35);
+ b3 = S (v17 - v35);
+ u12 = S (u17 - u35);
+ v12 = S (v17 + v35);
+ u12 = S (2 * F (C4, u12));
+ v12 = S (2 * F (C4, v12));
+ b1 = S (u12 + v12);
+ b2 = S (u12 - v12);
+
+ y0 = S (a0 + b0) >> COL_SHIFT;
+ y1 = S (a1 + b1) >> COL_SHIFT;
+ y2 = S (a2 + b2) >> COL_SHIFT;
+ y3 = S (a3 + b3) >> COL_SHIFT;
+
+ y4 = S (a3 - b3) >> COL_SHIFT;
+ y5 = S (a2 - b2) >> COL_SHIFT;
+ y6 = S (a1 - b1) >> COL_SHIFT;
+ y7 = S (a0 - b0) >> COL_SHIFT;
+
+ col[0*8] = y0;
+ col[1*8] = y1;
+ col[2*8] = y2;
+ col[3*8] = y3;
+ col[4*8] = y4;
+ col[5*8] = y5;
+ col[6*8] = y6;
+ col[7*8] = y7;
+}
+#endif
+
+
+/* MMX column IDCT */
+static inline void idct_col (int16_t * const col, const int offset)
+{
+#define T1 13036
+#define T2 27146
+#define T3 43790
+#define C4 23170
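+
+/* T1 = tan(1*pi/16) * (1<<16), T2 = tan(2*pi/16) * (1<<16),
+ * T3 = tan(3*pi/16) * (1<<16) and C4 = cos(4*pi/16) * (1<<15).
+ * T3 does not fit in a signed 16-bit word, so pmulhw effectively
+ * multiplies by T3 - (1<<16); the "(T3-1)*x" comments below and the
+ * extra paddsw of the source operand compensate for the missing +x. */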
+
+ static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+ static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+ static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+ static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+
+ /* column code adapted from Peter Gubanov */
+ /* http://www.elecard.com/peter/idct.shtml */
+
+ movq_m2r (*t1_vector, mm0); /* mm0 = T1 */
+
+ movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
+ movq_r2r (mm0, mm2); /* mm2 = T1 */
+
+ movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
+ pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */
+
+ movq_m2r (*t3_vector, mm5); /* mm5 = T3 */
+ pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */
+
+ movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
+ movq_r2r (mm5, mm7); /* mm7 = T3-1 */
+
+ movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
+ psubsw_r2r (mm4, mm0); /* mm0 = v17 */
+
+ movq_m2r (*t2_vector, mm4); /* mm4 = T2 */
+ pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */
+
+ paddsw_r2r (mm2, mm1); /* mm1 = u17 */
+ pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */
+
+ /* slot */
+
+ movq_r2r (mm4, mm2); /* mm2 = T2 */
+ paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */
+
+ pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
+ paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */
+
+ psubsw_r2r (mm6, mm5); /* mm5 = v35 */
+ paddsw_r2r (mm3, mm7); /* mm7 = u35 */
+
+ movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
+ movq_r2r (mm0, mm6); /* mm6 = v17 */
+
+ pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */
+ psubsw_r2r (mm5, mm0); /* mm0 = b3 */
+
+ psubsw_r2r (mm3, mm4); /* mm4 = v26 */
+ paddsw_r2r (mm6, mm5); /* mm5 = v12 */
+
+ movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */
+ movq_r2r (mm1, mm6); /* mm6 = u17 */
+
+ paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
+ paddsw_r2r (mm7, mm6); /* mm6 = b0 */
+
+ psubsw_r2r (mm7, mm1); /* mm1 = u12 */
+ movq_r2r (mm1, mm7); /* mm7 = u12 */
+
+ movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
+ paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */
+
+ movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */
+ psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */
+
+ movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */
+ pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */
+
+ movq_r2r (mm4, mm6); /* mm6 = v26 */
+ pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */
+
+ movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
+ movq_r2r (mm3, mm0); /* mm0 = x0 */
+
+ psubsw_r2r (mm5, mm3); /* mm3 = v04 */
+ paddsw_r2r (mm5, mm0); /* mm0 = u04 */
+
+ paddsw_r2r (mm3, mm4); /* mm4 = a1 */
+ movq_r2r (mm0, mm5); /* mm5 = u04 */
+
+ psubsw_r2r (mm6, mm3); /* mm3 = a2 */
+ paddsw_r2r (mm2, mm5); /* mm5 = a0 */
+
+ paddsw_r2r (mm1, mm1); /* mm1 = b1 */
+ psubsw_r2r (mm2, mm0); /* mm0 = a3 */
+
+ paddsw_r2r (mm7, mm7); /* mm7 = b2 */
+ movq_r2r (mm3, mm2); /* mm2 = a2 */
+
+ movq_r2r (mm4, mm6); /* mm6 = a1 */
+ paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */
+
+ psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
+ paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */
+
+ psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
+ psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */
+
+ movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */
+ psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */
+
+ psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
+ movq_r2r (mm5, mm7); /* mm7 = a0 */
+
+ movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */
+ psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
+
+ movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */
+ paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */
+
+ movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */
+ psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */
+
+ psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
+ movq_r2r (mm0, mm3); /* mm3 = a3 */
+
+ movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */
+ psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */
+
+ psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
+ paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */
+
+ movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */
+ psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
+
+ movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */
+ psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
+
+ movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */
+
+ movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */
+
+ movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */
+
+#undef T1
+#undef T2
+#undef T3
+#undef C4
+}
+
+
+static const int32_t rounder0[] ATTR_ALIGN(8) =
+ rounder ((1 << (COL_SHIFT - 1)) - 0.5);
+static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static const int32_t rounder1[] ATTR_ALIGN(8) =
+ rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */
+static const int32_t rounder7[] ATTR_ALIGN(8) =
+ rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */
+static const int32_t rounder2[] ATTR_ALIGN(8) =
+ rounder (0.60355339059); /* C2 * (C6+C2)/2 */
+static const int32_t rounder6[] ATTR_ALIGN(8) =
+ rounder (-0.25); /* C2 * (C6-C2)/2 */
+static const int32_t rounder3[] ATTR_ALIGN(8) =
+ rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */
+static const int32_t rounder5[] ATTR_ALIGN(8) =
+ rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */
+
+#undef COL_SHIFT
+#undef ROW_SHIFT
+
+#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
+void idct (int16_t * const block) \
+{ \
+ static const int16_t table04[] ATTR_ALIGN(16) = \
+ table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
+ static const int16_t table17[] ATTR_ALIGN(16) = \
+ table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
+ static const int16_t table26[] ATTR_ALIGN(16) = \
+ table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
+ static const int16_t table35[] ATTR_ALIGN(16) = \
+ table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
+ \
+ idct_row_head (block, 0*8, table04); \
+ idct_row (table04, rounder0); \
+ idct_row_mid (block, 0*8, 4*8, table04); \
+ idct_row (table04, rounder4); \
+ idct_row_mid (block, 4*8, 1*8, table17); \
+ idct_row (table17, rounder1); \
+ idct_row_mid (block, 1*8, 7*8, table17); \
+ idct_row (table17, rounder7); \
+ idct_row_mid (block, 7*8, 2*8, table26); \
+ idct_row (table26, rounder2); \
+ idct_row_mid (block, 2*8, 6*8, table26); \
+ idct_row (table26, rounder6); \
+ idct_row_mid (block, 6*8, 3*8, table35); \
+ idct_row (table35, rounder3); \
+ idct_row_mid (block, 3*8, 5*8, table35); \
+ idct_row (table35, rounder5); \
+ idct_row_tail (block, 5*8); \
+ \
+ idct_col (block, 0); \
+ idct_col (block, 4); \
+}
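+
+/* The rows are processed in the order 0,4,1,7,2,6,3,5 to match the four
+ * coefficient tables, and the calls are software-pipelined: each
+ * idct_row_mid() stores the previous row's results while loading the next
+ * row, with idct_row_head()/idct_row_tail() handling the ends. Each
+ * idct_col() call transforms four columns at once (one movq holds four
+ * 16-bit coefficients), so two calls cover all eight columns. */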
+
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+declare_idct (ff_mmxext_idct, mmxext_table,
+ mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
+
+declare_idct (ff_mmx_idct, mmx_table,
+ mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
new file mode 100644
index 0000000000..d4fdd7a54a
--- /dev/null
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -0,0 +1,525 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - MMX and XMM inverse discrete cosine transform -
+ *
+ * Copyright(C) 2001 Peter Ross <pross@xvid.org>
+ *
+ * Originally provided by Intel at AP-922
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * (see more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm),
+ * though only in a limited form.
+ * A new macro implements the column pass of a precise iDCT;
+ * the routine's precision now satisfies IEEE standard 1180-1990.
+ *
+ * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
+ * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
+ *
+ * http://www.elecard.com/peter/idct.html
+ * http://www.linuxvideo.org/mpeg2dec/
+ *
+ * These examples contain code fragments for the first stage of an 8x8 iDCT
+ * (for rows) and the first stage of an 8x8 DCT (for columns)
+ *
+ * conversion to gcc syntax by Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "libavcodec/avcodec.h"
+
+//=============================================================================
+// Macros and other preprocessor constants
+//=============================================================================
+
+#define BITS_INV_ACC 5 // 4 or 5 for IEEE
+#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
+#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
+#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
+#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
+#define RND_INV_CORR (RND_INV_COL - 1)
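+
+// With BITS_INV_ACC = 5 these work out to SHIFT_INV_ROW = 11,
+// SHIFT_INV_COL = 6, RND_INV_ROW = 1024, RND_INV_COL = 32 and
+// RND_INV_CORR = 31.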
+
+#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
+#define SHIFT_FRW_COL BITS_FRW_ACC
+#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
+#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
+
+
+//-----------------------------------------------------------------------------
+// Various memory constants (trigonometric values or rounding values)
+//-----------------------------------------------------------------------------
+
+
+DECLARE_ALIGNED(8, static const int16_t, tg_1_16[4*4]) = {
+  13036,13036,13036,13036,        // tan(1*pi/16) * (1<<16) + 0.5
+  27146,27146,27146,27146,        // tan(2*pi/16) * (1<<16) + 0.5
+ -21746,-21746,-21746,-21746,     // tan(3*pi/16) * (1<<16) + 0.5 - (1<<16)
+  23170,23170,23170,23170};       // cos(4*pi/16) * (1<<15) + 0.5
+
+DECLARE_ALIGNED(8, static const int32_t, rounder_0[2*8]) = {
+ 65536,65536,
+ 3597,3597,
+ 2260,2260,
+ 1203,1203,
+ 0,0,
+ 120,120,
+ 512,512,
+ 512,512};
+
+//-----------------------------------------------------------------------------
+//
+// The first stage iDCT 8x8 - inverse DCTs of rows
+//
+//-----------------------------------------------------------------------------
+// The 8-point inverse DCT direct algorithm
+//-----------------------------------------------------------------------------
+//
+// static const short w[32] = {
+// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
+// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
+// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
+// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
+// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
+// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
+//
+// #define DCT_8_INV_ROW(x, y)
+// {
+// int a0, a1, a2, a3, b0, b1, b2, b3;
+//
+// a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
+// a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
+// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
+// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+//
+// y[0] = SHIFT_ROUND ( a0 + b0 );
+// y[1] = SHIFT_ROUND ( a1 + b1 );
+// y[2] = SHIFT_ROUND ( a2 + b2 );
+// y[3] = SHIFT_ROUND ( a3 + b3 );
+// y[4] = SHIFT_ROUND ( a3 - b3 );
+// y[5] = SHIFT_ROUND ( a2 - b2 );
+// y[6] = SHIFT_ROUND ( a1 - b1 );
+// y[7] = SHIFT_ROUND ( a0 - b0 );
+// }
+//
+//-----------------------------------------------------------------------------
+//
+// In this implementation the outputs of the iDCT-1D are multiplied
+// for rows 0,4 - by cos_4_16,
+// for rows 1,7 - by cos_1_16,
+// for rows 2,6 - by cos_2_16,
+// for rows 3,5 - by cos_3_16
+// and are shifted to the left for better accuracy
+//
+// For the constants used,
+// FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
+//
+//-----------------------------------------------------------------------------
+
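+// Worked example of the FIX() scaling: FIX(cos_4_16) =
+// (short)(0.70710678 * (1<<15) + 0.5) = 23170. The rows 0,4 table below
+// is additionally multiplied by cos_4_16, so its w00 entry is
+// FIX(cos_4_16 * cos_4_16) = (short)(0.5 * (1<<15) + 0.5) = 16384.
+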
+//-----------------------------------------------------------------------------
+// Tables for mmx processors
+//-----------------------------------------------------------------------------
+
+// Table for rows 0,4 - constants are multiplied by cos_4_16
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx[32*4]) = {
+ 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
+ 21407,8867,8867,-21407, // w07 w05 w03 w01
+ 16384,-16384,16384,16384, // w14 w12 w10 w08
+ -8867,21407,-21407,-8867, // w15 w13 w11 w09
+ 22725,12873,19266,-22725, // w22 w20 w18 w16
+ 19266,4520,-4520,-12873, // w23 w21 w19 w17
+ 12873,4520,4520,19266, // w30 w28 w26 w24
+ -22725,19266,-12873,-22725, // w31 w29 w27 w25
+// Table for rows 1,7 - constants are multiplied by cos_1_16
+ 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
+ 29692,12299,12299,-29692, // w07 w05 w03 w01
+ 22725,-22725,22725,22725, // w14 w12 w10 w08
+ -12299,29692,-29692,-12299, // w15 w13 w11 w09
+ 31521,17855,26722,-31521, // w22 w20 w18 w16
+ 26722,6270,-6270,-17855, // w23 w21 w19 w17
+ 17855,6270,6270,26722, // w30 w28 w26 w24
+ -31521,26722,-17855,-31521, // w31 w29 w27 w25
+// Table for rows 2,6 - constants are multiplied by cos_2_16
+ 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
+ 27969,11585,11585,-27969, // w07 w05 w03 w01
+ 21407,-21407,21407,21407, // w14 w12 w10 w08
+ -11585,27969,-27969,-11585, // w15 w13 w11 w09
+ 29692,16819,25172,-29692, // w22 w20 w18 w16
+ 25172,5906,-5906,-16819, // w23 w21 w19 w17
+ 16819,5906,5906,25172, // w30 w28 w26 w24
+ -29692,25172,-16819,-29692, // w31 w29 w27 w25
+// Table for rows 3,5 - constants are multiplied by cos_3_16
+ 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
+ 25172,10426,10426,-25172, // w07 w05 w03 w01
+ 19266,-19266,19266,19266, // w14 w12 w10 w08
+ -10426,25172,-25172,-10426, // w15 w13 w11 w09
+ 26722,15137,22654,-26722, // w22 w20 w18 w16
+ 22654,5315,-5315,-15137, // w23 w21 w19 w17
+ 15137,5315,5315,22654, // w30 w28 w26 w24
+ -26722,22654,-15137,-26722, // w31 w29 w27 w25
+};
+//-----------------------------------------------------------------------------
+// Tables for xmm processors
+//-----------------------------------------------------------------------------
+
+// Table for rows 0,4 - constants are multiplied by cos_4_16
+DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm[32*4]) = {
+ 16384,21407,16384,8867, // movq-> w05 w04 w01 w00
+ 16384,8867,-16384,-21407, // w07 w06 w03 w02
+ 16384,-8867,16384,-21407, // w13 w12 w09 w08
+ -16384,21407,16384,-8867, // w15 w14 w11 w10
+ 22725,19266,19266,-4520, // w21 w20 w17 w16
+ 12873,4520,-22725,-12873, // w23 w22 w19 w18
+ 12873,-22725,4520,-12873, // w29 w28 w25 w24
+ 4520,19266,19266,-22725, // w31 w30 w27 w26
+// Table for rows 1,7 - constants are multiplied by cos_1_16
+ 22725,29692,22725,12299, // movq-> w05 w04 w01 w00
+ 22725,12299,-22725,-29692, // w07 w06 w03 w02
+ 22725,-12299,22725,-29692, // w13 w12 w09 w08
+ -22725,29692,22725,-12299, // w15 w14 w11 w10
+ 31521,26722,26722,-6270, // w21 w20 w17 w16
+ 17855,6270,-31521,-17855, // w23 w22 w19 w18
+ 17855,-31521,6270,-17855, // w29 w28 w25 w24
+ 6270,26722,26722,-31521, // w31 w30 w27 w26
+// Table for rows 2,6 - constants are multiplied by cos_2_16
+ 21407,27969,21407,11585, // movq-> w05 w04 w01 w00
+ 21407,11585,-21407,-27969, // w07 w06 w03 w02
+ 21407,-11585,21407,-27969, // w13 w12 w09 w08
+ -21407,27969,21407,-11585, // w15 w14 w11 w10
+ 29692,25172,25172,-5906, // w21 w20 w17 w16
+ 16819,5906,-29692,-16819, // w23 w22 w19 w18
+ 16819,-29692,5906,-16819, // w29 w28 w25 w24
+ 5906,25172,25172,-29692, // w31 w30 w27 w26
+// Table for rows 3,5 - constants are multiplied by cos_3_16
+ 19266,25172,19266,10426, // movq-> w05 w04 w01 w00
+ 19266,10426,-19266,-25172, // w07 w06 w03 w02
+ 19266,-10426,19266,-25172, // w13 w12 w09 w08
+ -19266,25172,19266,-10426, // w15 w14 w11 w10
+ 26722,22654,22654,-5315, // w21 w20 w17 w16
+ 15137,5315,-26722,-15137, // w23 w22 w19 w18
+ 15137,-26722,5315,-15137, // w29 w28 w25 w24
+ 5315,22654,22654,-26722, // w31 w30 w27 w26
+};
+//=============================================================================
+// Helper macros for the code
+//=============================================================================
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
+ "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
+ "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
+ "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
+ "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\
+ "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\
+ "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\
+ "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\
+ "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\
+ "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\
+ "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
+ "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\
+ "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\
+ "punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\
+ "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
+ "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\
+ "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
+ "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\
+ "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\
+ "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
+ "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
+ "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
+ "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
+ "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
+ "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
+ "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
+ "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
+ "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
+ "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
+ "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\
+ "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\
+ "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\
+ "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
+ "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\
+ "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
+ "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\
+ "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\
+ "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\
+ "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
+ "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\
+ "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\
+ "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\
+ "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\
+ "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\
+ "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\
+ "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\
+ "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\
+ "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
+
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
+ "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
+ "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
+ "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
+ "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\
+ "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\
+ "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\
+ "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\
+ "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
+ "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\
+ "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\
+ "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
+ "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\
+ "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\
+ "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
+ "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\
+ "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
+ "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
+ "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
+ "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
+ "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
+ "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
+ "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
+ "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
+ "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
+ "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\
+ "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
+ "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\
+ "paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
+ "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\
+ "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\
+ "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
+ "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\
+ "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\
+ "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\
+ "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
+ "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\
+ "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\
+ "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\
+ "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
+ "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\
+ "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
+
+
+//-----------------------------------------------------------------------------
+//
+// The first stage DCT 8x8 - forward DCTs of columns
+//
+// The inputs are multiplied
+// for rows 0,4 - by cos_4_16,
+// for rows 1,7 - by cos_1_16,
+// for rows 2,6 - by cos_2_16,
+// for rows 3,5 - by cos_3_16
+// and are shifted to the left to increase accuracy
+//
+//-----------------------------------------------------------------------------
+//
+// The 8-point scaled forward DCT algorithm (26a8m)
+//
+//-----------------------------------------------------------------------------
+//
+// #define DCT_8_FRW_COL(x, y)
+//{
+// short t0, t1, t2, t3, t4, t5, t6, t7;
+// short tp03, tm03, tp12, tm12, tp65, tm65;
+// short tp465, tm465, tp765, tm765;
+//
+// t0 = LEFT_SHIFT ( x[0] + x[7] );
+// t1 = LEFT_SHIFT ( x[1] + x[6] );
+// t2 = LEFT_SHIFT ( x[2] + x[5] );
+// t3 = LEFT_SHIFT ( x[3] + x[4] );
+// t4 = LEFT_SHIFT ( x[3] - x[4] );
+// t5 = LEFT_SHIFT ( x[2] - x[5] );
+// t6 = LEFT_SHIFT ( x[1] - x[6] );
+// t7 = LEFT_SHIFT ( x[0] - x[7] );
+//
+// tp03 = t0 + t3;
+// tm03 = t0 - t3;
+// tp12 = t1 + t2;
+// tm12 = t1 - t2;
+//
+// y[0] = tp03 + tp12;
+// y[4] = tp03 - tp12;
+//
+// y[2] = tm03 + tm12 * tg_2_16;
+// y[6] = tm03 * tg_2_16 - tm12;
+//
+// tp65 =(t6 +t5 )*cos_4_16;
+// tm65 =(t6 -t5 )*cos_4_16;
+//
+// tp765 = t7 + tp65;
+// tm765 = t7 - tp65;
+// tp465 = t4 + tm65;
+// tm465 = t4 - tm65;
+//
+// y[1] = tp765 + tp465 * tg_1_16;
+// y[7] = tp765 * tg_1_16 - tp465;
+// y[5] = tm765 * tg_3_16 + tm465;
+// y[3] = tm765 - tm465 * tg_3_16;
+//}
+//
+//-----------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------
+// DCT_8_INV_COL( INP, OUT )
+//-----------------------------------------------------------------------------
+
+#define DCT_8_INV_COL(A1,A2)\
+ "movq 2*8(%3),%%mm0\n\t"\
+ "movq 16*3+" #A1 ",%%mm3\n\t"\
+ "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
+ "movq 16*5+" #A1 ",%%mm5\n\t"\
+ "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
+ "movq (%3),%%mm4\n\t"\
+ "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
+ "movq 16*7+" #A1 ",%%mm7\n\t"\
+ "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
+ "movq 16*1+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
+ "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
+ "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
+ "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
+ "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
+ "movq 3*8(%3),%%mm3\n\t"\
+ "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
+ "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
+ "movq %%mm4,%%mm5 \n\t"/* tp17*/\
+ "movq %%mm2,%%mm6 \n\t"/* tm17*/\
+ "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
+ "psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
+ "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
+ "movq 1*8(%3),%%mm7\n\t"\
+ "movq %%mm4,%%mm1 \n\t"/* t1*/\
+ "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
+ "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
+ "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
+ "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
+ "movq 2*16+" #A1 ",%%mm5\n\t"\
+ "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
+ "movq 6*16+" #A1 ",%%mm6\n\t"\
+ "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
+ "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
+ "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
+ "movq 0*16+" #A1 ",%%mm2\n\t"\
+ "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
+ "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
+ "movq %%mm2,%%mm3 \n\t"/* x0*/\
+ "movq 4*16+" #A1 ",%%mm6\n\t"\
+ "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
+ "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
+ "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
+ "movq %%mm2,%%mm5 \n\t"/* tp04*/\
+ "movq %%mm3,%%mm6 \n\t"/* tm04*/\
+ "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
+ "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
+ "paddsw %%mm1,%%mm1 \n\t"/* b1*/\
+ "paddsw %%mm4,%%mm4 \n\t"/* b2*/\
+ "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
+ "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
+ "movq %%mm3,%%mm7 \n\t"/* a1*/\
+ "movq %%mm6,%%mm0 \n\t"/* a2*/\
+ "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
+ "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
+ "psraw $6,%%mm3 \n\t"/* dst1*/\
+ "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
+ "psraw $6,%%mm6 \n\t"/* dst2*/\
+ "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
+ "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
+ "psraw $6,%%mm7 \n\t"/* dst6*/\
+ "movq %%mm5,%%mm4 \n\t"/* a0*/\
+ "psraw $6,%%mm0 \n\t"/* dst5*/\
+ "movq %%mm3,1*16+" #A2 "\n\t"\
+ "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
+ "movq %%mm6,2*16+" #A2 "\n\t"\
+ "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
+ "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
+ "psraw $6,%%mm5 \n\t"/* dst0*/\
+ "movq %%mm2,%%mm6 \n\t"/* a3*/\
+ "psraw $6,%%mm4 \n\t"/* dst7*/\
+ "movq %%mm0,5*16+" #A2 "\n\t"\
+ "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
+ "movq %%mm7,6*16+" #A2 "\n\t"\
+ "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
+ "movq %%mm5,0*16+" #A2 "\n\t"\
+ "psraw $6,%%mm2 \n\t"/* dst3*/\
+ "movq %%mm4,7*16+" #A2 "\n\t"\
+ "psraw $6,%%mm6 \n\t"/* dst4*/\
+ "movq %%mm2,3*16+" #A2 "\n\t"\
+ "movq %%mm6,4*16+" #A2 "\n\t"
+
+//=============================================================================
+// Code
+//=============================================================================
+
+//-----------------------------------------------------------------------------
+// void ff_idct_xvid_mmx(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+
+void ff_idct_xvid_mmx(short *block){
+__asm__ volatile(
+ //# Process each row
+ DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+ DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+ DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+ DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+ DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+ DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+ DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+ DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+ //# Process the columns (4 at a time)
+ DCT_8_INV_COL(0(%0), 0(%0))
+ DCT_8_INV_COL(8(%0), 8(%0))
+ :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
+}
+
+//-----------------------------------------------------------------------------
+// void ff_idct_xvid_mmx2(int16_t block[64]);
+//-----------------------------------------------------------------------------
+
+
+void ff_idct_xvid_mmx2(short *block){
+__asm__ volatile(
+ //# Process each row
+ DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
+ DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
+ DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
+ DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
+ DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
+ DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
+ DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
+ DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
+
+ //# Process the columns (4 at a time)
+ DCT_8_INV_COL(0(%0), 0(%0))
+ DCT_8_INV_COL(8(%0), 8(%0))
+ :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
+}
+
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
new file mode 100644
index 0000000000..d8711a2ee2
--- /dev/null
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -0,0 +1,394 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ * - SSE2 inverse discrete cosine transform -
+ *
+ * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
+ *
+ * Conversion to gcc syntax with modifications
+ * by Alexander Strange <astrange@ithinksw.com>
+ *
+ * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
+ *
+ * This file is part of FFmpeg.
+ *
+ * Vertical pass is an implementation of the scheme:
+ * Loeffler C., Ligtenberg A., and Moschytz C.S.:
+ * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
+ * Proc. ICASSP 1989, 988-991.
+ *
+ * Horizontal pass is a double 4x4 vector/matrix multiplication,
+ * (see also Intel's Application Note 922:
+ * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
+ * Copyright (C) 1999 Intel Corporation)
+ *
+ * More details at http://skal.planet-d.net/coding/dct.html
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#include "idct_xvid.h"
+
+/*!
+ * @file idct_sse2_xvid.c
+ * @brief SSE2 idct compatible with xvidmmx
+ */
+
+#define X8(x) x,x,x,x,x,x,x,x
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16)
+DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
+DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1
+DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2)
+DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)};
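+
+// These constants are the trig values from the comments in 0.16 fixed
+// point: pmulhw keeps the high 16 bits of the 32-bit signed product, an
+// implicit >>16, so each entry is round(value * 2^16), with negative values
+// stored in two's complement. A sketch of the correspondence (hypothetical
+// helper, assuming <math.h>):
+#if 0
+#include <math.h>
+static uint16_t fix16(double v) { return (uint16_t)lrint(v * 65536.0); }
+// fix16(tan(M_PI/16))     == 13036
+// fix16(tan(2*M_PI/16))   == 27146
+// fix16(tan(3*M_PI/16)-1) == 43790  (i.e. -21746)
+// fix16(0.5/sqrt(2))      == 23170
+#endif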
+
+DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {
+ 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
+ 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
+ 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
+ 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {
+ 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
+ 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
+ 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
+ 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {
+ 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
+ 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
+ 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
+ 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
+};
+
+DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {
+ 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
+ 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
+ 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
+ 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
+};
+
+DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {
+ 65536, 65536, 65536, 65536,
+ 3597, 3597, 3597, 3597,
+ 2260, 2260, 2260, 2260,
+ 1203, 1203, 1203, 1203,
+ 120, 120, 120, 120,
+ 512, 512, 512, 512
+};
+
+// Temporary storage before the column pass
+#define ROW1 "%%xmm6"
+#define ROW3 "%%xmm4"
+#define ROW5 "%%xmm5"
+#define ROW7 "%%xmm7"
+
+#define CLEAR_ODD(r) "pxor "r","r" \n\t"
+#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
+
+#ifdef ARCH_X86_64
+
+# define ROW0 "%%xmm8"
+# define REG0 ROW0
+# define ROW2 "%%xmm9"
+# define REG2 ROW2
+# define ROW4 "%%xmm10"
+# define REG4 ROW4
+# define ROW6 "%%xmm11"
+# define REG6 ROW6
+# define CLEAR_EVEN(r) CLEAR_ODD(r)
+# define PUT_EVEN(dst) PUT_ODD(dst)
+# define XMMS "%%xmm12"
+# define MOV_32_ONLY "#"
+# define SREG2 REG2
+# define TAN3 "%%xmm13"
+# define TAN1 "%%xmm14"
+
+#else
+
+# define ROW0 "(%0)"
+# define REG0 "%%xmm4"
+# define ROW2 "2*16(%0)"
+# define REG2 "%%xmm4"
+# define ROW4 "4*16(%0)"
+# define REG4 "%%xmm6"
+# define ROW6 "6*16(%0)"
+# define REG6 "%%xmm6"
+# define CLEAR_EVEN(r)
+# define PUT_EVEN(dst) \
+ "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
+ "movdqa %%xmm2, "dst" \n\t"
+# define XMMS "%%xmm2"
+# define MOV_32_ONLY "movdqa "
+# define SREG2 "%%xmm7"
+# define TAN3 "%%xmm0"
+# define TAN1 "%%xmm2"
+
+#endif
+
+#define ROUND(x) "paddd "MANGLE(x)
+
+#define JZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jz "to" \n\t"
+
+#define JNZ(reg, to) \
+ "testl "reg","reg" \n\t" \
+ "jnz "to" \n\t"
+
+#define TEST_ONE_ROW(src, reg, clear) \
+ clear \
+ "movq "src", %%mm1 \n\t" \
+ "por 8+"src", %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "pmovmskb %%mm1, "reg" \n\t"
+
+#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
+ clear1 \
+ clear2 \
+ "movq "row1", %%mm1 \n\t" \
+ "por 8+"row1", %%mm1 \n\t" \
+ "movq "row2", %%mm2 \n\t" \
+ "por 8+"row2", %%mm2 \n\t" \
+ "paddusb %%mm0, %%mm1 \n\t" \
+ "paddusb %%mm0, %%mm2 \n\t" \
+ "pmovmskb %%mm1, "reg1" \n\t" \
+ "pmovmskb %%mm2, "reg2" \n\t"
+
+/// IDCT pass on rows.
+#define iMTX_MULT(src, table, rounder, put) \
+ "movdqa "src", %%xmm3 \n\t" \
+ "movdqa %%xmm3, %%xmm0 \n\t" \
+ "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
+ "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
+ "pmaddwd "table", %%xmm0 \n\t" \
+ "pmaddwd 16+"table", %%xmm1 \n\t" \
+ "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
+ "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
+ "pmaddwd 32+"table", %%xmm2 \n\t" \
+ "pmaddwd 48+"table", %%xmm3 \n\t" \
+ "paddd %%xmm1, %%xmm0 \n\t" \
+ "paddd %%xmm3, %%xmm2 \n\t" \
+ rounder", %%xmm0 \n\t" \
+ "movdqa %%xmm2, %%xmm3 \n\t" \
+ "paddd %%xmm0, %%xmm2 \n\t" \
+ "psubd %%xmm3, %%xmm0 \n\t" \
+ "psrad $11, %%xmm2 \n\t" \
+ "psrad $11, %%xmm0 \n\t" \
+ "packssdw %%xmm0, %%xmm2 \n\t" \
+ put \
+ "1: \n\t"
+
+#define iLLM_HEAD \
+ "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
+ "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
+
+/// IDCT pass on columns.
+#define iLLM_PASS(dct) \
+ "movdqa "TAN3", %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "pmulhw %%xmm5, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm5, %%xmm1 \n\t" \
+ "psubsw %%xmm5, "TAN3" \n\t" \
+ "paddsw %%xmm4, %%xmm1 \n\t" \
+ "pmulhw %%xmm7, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "paddsw %%xmm6, %%xmm3 \n\t" \
+ "psubsw %%xmm7, "TAN1" \n\t" \
+ "movdqa %%xmm3, %%xmm7 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm7, %%xmm1 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
+ MOV_32_ONLY ROW2", "REG2" \n\t" \
+ MOV_32_ONLY ROW6", "REG6" \n\t" \
+ "movdqa %%xmm7, %%xmm5 \n\t" \
+ "pmulhw "REG6", %%xmm7 \n\t" \
+ "pmulhw "REG2", %%xmm5 \n\t" \
+ "paddsw "REG2", %%xmm7 \n\t" \
+ "psubsw "REG6", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ MOV_32_ONLY ROW4", "REG4" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw "REG4", "REG0" \n\t" \
+ "paddsw "XMMS", "REG4" \n\t" \
+ "movdqa "REG4", "XMMS" \n\t" \
+ "psubsw %%xmm7, "REG4" \n\t" \
+ "paddsw "XMMS", %%xmm7 \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa %%xmm7, %%xmm0 \n\t" \
+ "movdqa "REG4", %%xmm4 \n\t" \
+ "psubsw %%xmm1, %%xmm7 \n\t" \
+ "psubsw "TAN1", "REG4" \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, %%xmm7 \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, "REG4" \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa "REG4", 4*16("dct") \n\t" \
+ "movdqa %%xmm7, 7*16("dct") \n\t"
+
+/// IDCT pass on columns, assuming rows 4-7 are zero.
+#define iLLM_PASS_SPARSE(dct) \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw %%xmm4, "TAN3" \n\t" \
+ "movdqa %%xmm6, %%xmm3 \n\t" \
+ "pmulhw %%xmm6, "TAN1" \n\t" \
+ "movdqa %%xmm4, %%xmm1 \n\t" \
+ "psubsw %%xmm1, %%xmm3 \n\t" \
+ "paddsw %%xmm6, %%xmm1 \n\t" \
+ "movdqa "TAN1", %%xmm6 \n\t" \
+ "psubsw "TAN3", "TAN1" \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa %%xmm3, %%xmm6 \n\t" \
+ "psubsw "TAN3", %%xmm3 \n\t" \
+ "paddsw %%xmm6, "TAN3" \n\t" \
+ "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
+ "pmulhw %%xmm4, %%xmm3 \n\t" \
+ "pmulhw %%xmm4, "TAN3" \n\t" \
+ "paddsw "TAN3", "TAN3" \n\t" \
+ "paddsw %%xmm3, %%xmm3 \n\t" \
+ "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW2", "SREG2" \n\t" \
+ "pmulhw "SREG2", %%xmm5 \n\t" \
+ MOV_32_ONLY ROW0", "REG0" \n\t" \
+ "movdqa "REG0", %%xmm6 \n\t" \
+ "psubsw "SREG2", %%xmm6 \n\t" \
+ "paddsw "REG0", "SREG2" \n\t" \
+ MOV_32_ONLY" "TAN1", (%0) \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm5, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm5 \n\t" \
+ "movdqa %%xmm5, "XMMS" \n\t" \
+ "psubsw "TAN3", %%xmm5 \n\t" \
+ "paddsw "XMMS", "TAN3" \n\t" \
+ "movdqa "REG0", "XMMS" \n\t" \
+ "psubsw %%xmm3, "REG0" \n\t" \
+ "paddsw "XMMS", %%xmm3 \n\t" \
+ MOV_32_ONLY" (%0), "TAN1" \n\t" \
+ "psraw $6, %%xmm5 \n\t" \
+ "psraw $6, "REG0" \n\t" \
+ "psraw $6, "TAN3" \n\t" \
+ "psraw $6, %%xmm3 \n\t" \
+ "movdqa "TAN3", 1*16("dct") \n\t" \
+ "movdqa %%xmm3, 2*16("dct") \n\t" \
+ "movdqa "REG0", 5*16("dct") \n\t" \
+ "movdqa %%xmm5, 6*16("dct") \n\t" \
+ "movdqa "SREG2", %%xmm0 \n\t" \
+ "movdqa %%xmm6, %%xmm4 \n\t" \
+ "psubsw %%xmm1, "SREG2" \n\t" \
+ "psubsw "TAN1", %%xmm6 \n\t" \
+ "paddsw %%xmm0, %%xmm1 \n\t" \
+ "paddsw %%xmm4, "TAN1" \n\t" \
+ "psraw $6, %%xmm1 \n\t" \
+ "psraw $6, "SREG2" \n\t" \
+ "psraw $6, "TAN1" \n\t" \
+ "psraw $6, %%xmm6 \n\t" \
+ "movdqa %%xmm1, ("dct") \n\t" \
+ "movdqa "TAN1", 3*16("dct") \n\t" \
+ "movdqa %%xmm6, 4*16("dct") \n\t" \
+ "movdqa "SREG2", 7*16("dct") \n\t"
+
+inline void ff_idct_xvid_sse2(short *block)
+{
+ __asm__ volatile(
+ "movq "MANGLE(m127)", %%mm0 \n\t"
+ iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
+ iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
+ iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
+
+ TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
+ JZ("%%eax", "1f")
+ iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
+
+ TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
+ TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
+ iLLM_HEAD
+ ASMALIGN(4)
+ JNZ("%%ecx", "2f")
+ JNZ("%%eax", "3f")
+ JNZ("%%edx", "4f")
+ JNZ("%%esi", "5f")
+ iLLM_PASS_SPARSE("%0")
+ "jmp 6f \n\t"
+ "2: \n\t"
+ iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
+ "3: \n\t"
+ iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
+ JZ("%%edx", "1f")
+ "4: \n\t"
+ iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
+ JZ("%%esi", "1f")
+ "5: \n\t"
+ iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
+#ifndef ARCH_X86_64
+ iLLM_HEAD
+#endif
+ iLLM_PASS("%0")
+ "6: \n\t"
+ : "+r"(block)
+ :
+ : "%eax", "%ecx", "%edx", "%esi", "memory");
+}
+
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
+{
+ ff_idct_xvid_sse2(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h
new file mode 100644
index 0000000000..bddbdb95c0
--- /dev/null
+++ b/libavcodec/x86/idct_xvid.h
@@ -0,0 +1,37 @@
+/*
+ * XVID MPEG-4 VIDEO CODEC
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*!
+ * @file idct_xvid.h
+ * header for Xvid IDCT functions
+ */
+
+#ifndef AVCODEC_X86_IDCT_XVID_H
+#define AVCODEC_X86_IDCT_XVID_H
+
+#include <stdint.h>
+
+void ff_idct_xvid_mmx(short *block);
+void ff_idct_xvid_mmx2(short *block);
+void ff_idct_xvid_sse2(short *block);
+void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
+void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
+
+#endif /* AVCODEC_X86_IDCT_XVID_H */
diff --git a/libavcodec/x86/mathops.h b/libavcodec/x86/mathops.h
new file mode 100644
index 0000000000..95377acab8
--- /dev/null
+++ b/libavcodec/x86/mathops.h
@@ -0,0 +1,43 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_MATHOPS_H
+#define AVCODEC_X86_MATHOPS_H
+
+#define MULL(ra, rb, shift) \
+ ({ int rt, dummy; __asm__ (\
+ "imull %3 \n\t"\
+ "shrdl %4, %%edx, %%eax \n\t"\
+ : "=a"(rt), "=d"(dummy)\
+ : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\
+ rt; })
+
+#define MULH(ra, rb) \
+ ({ int rt, dummy;\
+ __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\
+ rt; })
+
+#define MUL64(ra, rb) \
+ ({ int64_t rt;\
+ __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\
+ rt; })
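+
+/* Plain C equivalents of the macros above, for reference (a sketch; the
+ * _c names are hypothetical, and the asm forms do the whole job in a
+ * single imull): */
+static inline int mull_c(int a, int b, int shift)
+{
+    return (int)(((int64_t)a * b) >> shift);   /* imull + shrdl            */
+}
+static inline int mulh_c(int a, int b)
+{
+    return (int)(((int64_t)a * b) >> 32);      /* high 32 bits, from %edx  */
+}
+static inline int64_t mul64_c(int a, int b)
+{
+    return (int64_t)a * b;                     /* full product, %edx:%eax  */
+}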
+
+#endif /* AVCODEC_X86_MATHOPS_H */
diff --git a/libavcodec/x86/mmx.h b/libavcodec/x86/mmx.h
new file mode 100644
index 0000000000..d7a76bbd7d
--- /dev/null
+++ b/libavcodec/x86/mmx.h
@@ -0,0 +1,267 @@
+/*
+ * mmx.h
+ * Copyright (C) 1997-2001 H. Dietz and R. Fisher
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVCODEC_X86_MMX_H
+#define AVCODEC_X86_MMX_H
+
+#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
+
+
+#define mmx_i2r(op,imm,reg) \
+ __asm__ volatile (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "i" (imm) )
+
+#define mmx_m2r(op,mem,reg) \
+ __asm__ volatile (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "m" (mem))
+
+#define mmx_r2m(op,reg,mem) \
+ __asm__ volatile (#op " %%" #reg ", %0" \
+ : "=m" (mem) \
+ : /* nothing */ )
+
+#define mmx_r2r(op,regs,regd) \
+ __asm__ volatile (#op " %" #regs ", %" #regd)
+
+
+#define emms() __asm__ volatile ("emms")
+
+#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
+#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
+#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
+
+#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
+#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
+#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
+
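+/* Usage sketch: each wrapper expands to a single inline-asm statement,
+ * e.g. movq_m2r(x, mm0) becomes
+ *     __asm__ volatile ("movq %0, %%mm0" : : "m" (x));
+ * so copying 8 bytes through an MMX register looks like this
+ * (illustrative only, not used anywhere): */
+#if 0
+static void copy8(const uint64_t *src, uint64_t *dst)
+{
+    movq_m2r(*src, mm0);   /* load 8 bytes into %mm0 */
+    movq_r2m(mm0, *dst);   /* store them back        */
+    emms();                /* leave MMX state        */
+}
+#endif
+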
+#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
+#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
+#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+
+#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
+#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+
+#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
+#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
+#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
+#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
+#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
+#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
+
+#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
+#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
+#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
+#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
+
+#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
+#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
+#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
+#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
+
+#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
+#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
+
+#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
+#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
+
+#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
+#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
+#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
+#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
+#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
+#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
+
+#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
+#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
+#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
+#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
+#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
+#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
+
+#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
+#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
+
+#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
+#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
+
+#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
+#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
+
+#define por_m2r(var,reg) mmx_m2r (por, var, reg)
+#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
+
+#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
+#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
+#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
+#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
+#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
+#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
+#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
+#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
+#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
+
+#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
+#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
+#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
+#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
+#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
+#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
+
+#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
+#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
+#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
+#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
+#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
+#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
+#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
+#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
+#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
+
+#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
+#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
+#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
+#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
+#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
+#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
+
+#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
+#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
+#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
+#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
+
+#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
+#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
+#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
+#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
+
+#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
+#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
+#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
+#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
+#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
+#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
+
+#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
+#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
+#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
+#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
+#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
+#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
+
+#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
+#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions */
+
+#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
+#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+ __asm__ volatile (#op " %1, %0, %%" #reg \
+ : /* nothing */ \
+ : "m" (mem), "i" (imm))
+#define mmx_r2ri(op,regs,regd,imm) \
+ __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
+ : /* nothing */ \
+ : "i" (imm) )
+
+#define mmx_fetch(mem,hint) \
+ __asm__ volatile ("prefetch" #hint " %0" \
+ : /* nothing */ \
+ : "m" (mem))
+
+
+#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
+
+#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
+
+#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
+#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
+#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
+#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
+
+#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
+
+#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
+
+#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
+#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
+
+#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
+#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
+
+#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
+#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
+
+#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
+#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
+
+#define pmovmskb(mmreg,reg) \
+ __asm__ volatile ("movmskps %" #mmreg ", %" #reg)
+
+#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
+#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
+
+#define prefetcht0(mem) mmx_fetch (mem, t0)
+#define prefetcht1(mem) mmx_fetch (mem, t1)
+#define prefetcht2(mem) mmx_fetch (mem, t2)
+#define prefetchnta(mem) mmx_fetch (mem, nta)
+
+#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
+#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
+
+#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
+#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
+
+#define sfence() __asm__ volatile ("sfence\n\t")
+
+/* SSE2 */
+#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
+#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
+#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
+#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
+
+#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
+
+#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
+#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
+#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
+#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
+#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
+#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
+
+#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
+
+#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
+#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
+
+#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
+#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
+
+
+#endif /* AVCODEC_X86_MMX_H */
diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c
new file mode 100644
index 0000000000..c866e8ad7b
--- /dev/null
+++ b/libavcodec/x86/motion_est_mmx.c
@@ -0,0 +1,461 @@
+/*
+ * MMX optimized motion estimation
+ * Copyright (c) 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * mostly by Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+
+DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={
+0x0000000000000000ULL,
+0x0001000100010001ULL,
+0x0002000200020002ULL,
+};
+
+DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
+
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(stride*h);
+ __asm__ volatile(
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq (%2, %%"REG_a"), %%mm2 \n\t"
+ "movq (%2, %%"REG_a"), %%mm4 \n\t"
+ "add %3, %%"REG_a" \n\t"
+ "psubusb %%mm0, %%mm2 \n\t"
+ "psubusb %%mm4, %%mm0 \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "movq (%2, %%"REG_a"), %%mm5 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm5, %%mm1 \n\t"
+ "por %%mm2, %%mm0 \n\t"
+ "por %%mm1, %%mm3 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm2 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "add %3, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
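+
+/* Scalar reference for the loop above (a sketch; the _c helper is
+ * hypothetical): psubusb in both directions followed by por yields
+ * |a - b| per byte, which is then widened against zero (%mm7) and
+ * accumulated into %mm6. */
+static inline int sad8_1_c(const uint8_t *blk1, const uint8_t *blk2,
+                           int stride, int h)
+{
+    int sum = 0, x, y;
+    for (y = 0; y < h; y++)
+        for (x = 0; x < 8; x++) {
+            int d = blk1[y * stride + x] - blk2[y * stride + x];
+            sum += d < 0 ? -d : d;
+        }
+    return sum;
+}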
+
+static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ __asm__ volatile(
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
+{
+ int ret;
+ __asm__ volatile(
+ "pxor %%xmm6, %%xmm6 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu (%1, %3), %%xmm1 \n\t"
+ "psadbw (%2), %%xmm0 \n\t"
+ "psadbw (%2, %3), %%xmm1 \n\t"
+ "paddw %%xmm0, %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm6 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+ __asm__ volatile(
+ "movhlps %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm0, %%xmm6 \n\t"
+ "movd %%xmm6, %0 \n\t"
+ : "=r"(ret)
+ );
+ return ret;
+}
+
+static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ __asm__ volatile(
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "movq (%1, %3), %%mm1 \n\t"
+ "pavgb 1(%1), %%mm0 \n\t"
+ "pavgb 1(%1, %3), %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1, %3), %%mm2 \n\t"
+ "pavgb %%mm1, %%mm0 \n\t"
+ "pavgb %%mm2, %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2, %3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ __asm__ volatile(
+ "movq "MANGLE(bone)", %%mm5 \n\t"
+ "movq (%1), %%mm0 \n\t"
+ "pavgb 1(%1), %%mm0 \n\t"
+ "add %3, %1 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1,%3), %%mm2 \n\t"
+ "pavgb 1(%1), %%mm1 \n\t"
+ "pavgb 1(%1,%3), %%mm2 \n\t"
+ "psubusb %%mm5, %%mm1 \n\t"
+ "pavgb %%mm1, %%mm0 \n\t"
+ "pavgb %%mm2, %%mm1 \n\t"
+ "psadbw (%2), %%mm0 \n\t"
+ "psadbw (%2,%3), %%mm1 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm1, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "lea (%1,%3,2), %1 \n\t"
+ "lea (%2,%3,2), %2 \n\t"
+ "sub $2, %0 \n\t"
+ " jg 1b \n\t"
+ : "+r" (h), "+r" (blk1), "+r" (blk2)
+ : "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(stride*h);
+ __asm__ volatile(
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq (%2, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm2 \n\t"
+ "movq (%2, %%"REG_a"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm0, %%mm1 \n\t"
+ "paddw %%mm2, %%mm3 \n\t"
+ "movq (%3, %%"REG_a"), %%mm4 \n\t"
+ "movq (%3, %%"REG_a"), %%mm2 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "psrlw $1, %%mm1 \n\t"
+ "psrlw $1, %%mm3 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm2, %%mm1 \n\t"
+ "por %%mm4, %%mm1 \n\t"
+ "movq %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "add %4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
+
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ x86_reg len= -(stride*h);
+ __asm__ volatile(
+ "movq (%1, %%"REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%2, %%"REG_a"), %%mm2 \n\t"
+ "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddw %%mm4, %%mm2 \n\t"
+ "paddw %%mm5, %%mm3 \n\t"
+ "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm3, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "movq (%3, %%"REG_a"), %%mm4 \n\t"
+ "movq (%3, %%"REG_a"), %%mm5 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "psubusb %%mm0, %%mm4 \n\t"
+ "psubusb %%mm5, %%mm0 \n\t"
+ "por %%mm4, %%mm0 \n\t"
+ "movq %%mm0, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpckhbw %%mm7, %%mm4 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, %%mm0 \n\t"
+ "movq %%mm3, %%mm1 \n\t"
+ "add %4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : "+a" (len)
+ : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
+ );
+}
+
+static inline int sum_mmx(void)
+{
+ int ret;
+ __asm__ volatile(
+ "movq %%mm6, %%mm0 \n\t"
+ "psrlq $32, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "movq %%mm6, %%mm0 \n\t"
+ "psrlq $16, %%mm6 \n\t"
+ "paddw %%mm0, %%mm6 \n\t"
+ "movd %%mm6, %0 \n\t"
+ : "=r" (ret)
+ );
+ return ret&0xFFFF;
+}
+
+static inline int sum_mmx2(void)
+{
+ int ret;
+ __asm__ volatile(
+ "movd %%mm6, %0 \n\t"
+ : "=r" (ret)
+ );
+ return ret;
+}
+
+static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
+}
+static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+{
+ sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
+}
+
+
+#define PIX_SAD(suf)\
+static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ assert(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t":);\
+\
+ sad8_1_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ assert(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ assert(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ assert(h==8);\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ ::);\
+\
+ sad8_4_ ## suf(blk1, blk2, stride, 8);\
+\
+ return sum_ ## suf();\
+}\
+\
+static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t":);\
+\
+ sad8_1_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ :: "m"(round_tab[1]) \
+ );\
+\
+ sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+ __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ ::);\
+\
+ sad8_4_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+ return sum_ ## suf();\
+}\
+
+PIX_SAD(mmx)
+PIX_SAD(mmx2)
+
+void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
+{
+ if (mm_flags & FF_MM_MMX) {
+ c->pix_abs[0][0] = sad16_mmx;
+ c->pix_abs[0][1] = sad16_x2_mmx;
+ c->pix_abs[0][2] = sad16_y2_mmx;
+ c->pix_abs[0][3] = sad16_xy2_mmx;
+ c->pix_abs[1][0] = sad8_mmx;
+ c->pix_abs[1][1] = sad8_x2_mmx;
+ c->pix_abs[1][2] = sad8_y2_mmx;
+ c->pix_abs[1][3] = sad8_xy2_mmx;
+
+ c->sad[0]= sad16_mmx;
+ c->sad[1]= sad8_mmx;
+ }
+ if (mm_flags & FF_MM_MMXEXT) {
+ c->pix_abs[0][0] = sad16_mmx2;
+ c->pix_abs[1][0] = sad8_mmx2;
+
+ c->sad[0]= sad16_mmx2;
+ c->sad[1]= sad8_mmx2;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->pix_abs[0][1] = sad16_x2_mmx2;
+ c->pix_abs[0][2] = sad16_y2_mmx2;
+ c->pix_abs[0][3] = sad16_xy2_mmx2;
+ c->pix_abs[1][1] = sad8_x2_mmx2;
+ c->pix_abs[1][2] = sad8_y2_mmx2;
+ c->pix_abs[1][3] = sad8_xy2_mmx2;
+ }
+ }
+ if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) {
+ c->sad[0]= sad16_sse2;
+ }
+}
diff --git a/libavcodec/x86/mpegvideo_mmx.c b/libavcodec/x86/mpegvideo_mmx.c
new file mode 100644
index 0000000000..406d3b8c07
--- /dev/null
+++ b/libavcodec/x86/mpegvideo_mmx.c
@@ -0,0 +1,654 @@
+/*
+ * The simplest mpeg encoder (well, it was the simplest!)
+ * Copyright (c) 2000,2001 Fabrice Bellard.
+ *
+ * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
+ * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+#include "dsputil_mmx.h"
+
+extern uint16_t inv_zigzag_direct16[64];
+
+
+static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg level, qmul, qadd, nCoeffs;
+
+ qmul = qscale << 1;
+
+ assert(s->block_last_index[n]>=0 || s->h263_aic);
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level= block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+//printf("%d %d ", qmul, qadd);
+__asm__ volatile(
+ "movd %1, %%mm6 \n\t" //qmul
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "movd %2, %%mm5 \n\t" //qadd
+ "pxor %%mm7, %%mm7 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "psubw %%mm5, %%mm7 \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %3), %%mm0 \n\t"
+ "movq 8(%0, %3), %%mm1 \n\t"
+
+ "pmullw %%mm6, %%mm0 \n\t"
+ "pmullw %%mm6, %%mm1 \n\t"
+
+ "movq (%0, %3), %%mm2 \n\t"
+ "movq 8(%0, %3), %%mm3 \n\t"
+
+ "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+
+ "paddw %%mm7, %%mm0 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+
+ "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+
+ "pandn %%mm2, %%mm0 \n\t"
+ "pandn %%mm3, %%mm1 \n\t"
+
+ "movq %%mm0, (%0, %3) \n\t"
+ "movq %%mm1, 8(%0, %3) \n\t"
+
+ "add $16, %3 \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
+ : "memory"
+ );
+ block[0]= level;
+}
+
+
+static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg qmul, qadd, nCoeffs;
+
+ qmul = qscale << 1;
+ qadd = (qscale - 1) | 1;
+
+ assert(s->block_last_index[n]>=0 || s->h263_aic);
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+//printf("%d %d ", qmul, qadd);
+__asm__ volatile(
+ "movd %1, %%mm6 \n\t" //qmul
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "movd %2, %%mm5 \n\t" //qadd
+ "pxor %%mm7, %%mm7 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "psubw %%mm5, %%mm7 \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %3), %%mm0 \n\t"
+ "movq 8(%0, %3), %%mm1 \n\t"
+
+ "pmullw %%mm6, %%mm0 \n\t"
+ "pmullw %%mm6, %%mm1 \n\t"
+
+ "movq (%0, %3), %%mm2 \n\t"
+ "movq 8(%0, %3), %%mm3 \n\t"
+
+ "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+
+ "paddw %%mm7, %%mm0 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+
+ "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+
+ "pandn %%mm2, %%mm0 \n\t"
+ "pandn %%mm3, %%mm1 \n\t"
+
+ "movq %%mm0, (%0, %3) \n\t"
+ "movq %%mm1, 8(%0, %3) \n\t"
+
+ "add $16, %3 \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
+ : "memory"
+ );
+}
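+
+/* Scalar reference for the two H.263 dequantizers above (a sketch with a
+ * hypothetical helper): zero coefficients stay zero, nonzero ones are
+ * scaled by qmul and pushed away from zero by qadd; the intra version
+ * additionally restores the separately computed DC value afterwards. */
+static inline void dct_unquantize_h263_ref(DCTELEM *block, int nCoeffs,
+                                           int qmul, int qadd)
+{
+    int i;
+    for (i = 0; i <= nCoeffs; i++) {
+        int level = block[i];
+        if (level > 0)
+            block[i] = level * qmul + qadd;
+        else if (level < 0)
+            block[i] = level * qmul - qadd;
+    }
+}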
+
+
+/*
+ NK:
+ Note: looking at PARANOID:
+ "enable all paranoid tests for rounding, overflows, etc..."
+
+#ifdef PARANOID
+ if (level < -2048 || level > 2047)
+ fprintf(stderr, "unquant error %d %d\n", i, level);
+#endif
+ We can suppose that the result of the two multiplications can't be greater
+ than 0xFFFF, i.e. is 16-bit, so here we use only the PMULLW instruction
+ and can avoid a complex multiplication.
+=====================================================
+ Full formula for the multiplication of 2 integer numbers
+ which are represented as high:low words:
+ input: value1 = high1:low1
+ value2 = high2:low2
+ output: value3 = value1*value2
+ value3 = high3:low3 (on overflow: modulo 2^32 wrap-around)
+ This means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4,
+ but this algorithm will compute only the low 32 bits, 0x66cb0ce4,
+ because it is limited by the 16-bit size of the operands.
+ ---------------------------------
+ tlow1 = high1*low2
+ tlow2 = high2*low1
+ tlow1 = tlow1 + tlow2
+ high3:low3 = low1*low2
+ high3 += tlow1
+*/
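+
+/* The decomposition above written out in C (a sketch; the dequantizers can
+ * skip it entirely because their operands fit in 16 bits): */
+static inline uint32_t mul32_from_halves(uint32_t value1, uint32_t value2)
+{
+    uint16_t low1 = value1, high1 = value1 >> 16;
+    uint16_t low2 = value2, high2 = value2 >> 16;
+    uint32_t tlow = (uint32_t)high1 * low2 + (uint32_t)high2 * low1;
+    return (uint32_t)low1 * low2 + (tlow << 16);  /* value1*value2 mod 2^32 */
+}
+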
+static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+ int block0;
+
+ assert(s->block_last_index[n]>=0);
+
+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
+
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ /* XXX: only mpeg1 */
+ quant_matrix = s->intra_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $3, %%mm0 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm1 \n\t"
+ "por %%mm7, %%mm0 \n\t"
+ "por %%mm7, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "js 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+ block[0]= block0;
+}
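+
+/* Scalar sketch of what the asm above computes (illustrative, kept out of
+ * the build): MPEG-1 intra dequantization with oddification; the DC
+ * coefficient is handled separately via block0 as above. */
+#if 0
+static void dct_unquantize_mpeg1_intra_ref(DCTELEM *block, int nCoeffs,
+                                           const uint16_t *quant_matrix,
+                                           int qscale)
+{
+    int i;
+    for (i = 1; i < nCoeffs; i++) {
+        int level = block[i];
+        if (level) {
+            int negative = level < 0;
+            level = (FFABS(level) * qscale * quant_matrix[i]) >> 3;
+            level = (level - 1) | 1; /* force odd to limit IDCT mismatch */
+            block[i] = negative ? -level : level;
+        }
+    }
+}
+#endif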
+
+static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+
+ assert(s->block_last_index[n]>=0);
+
+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
+
+ quant_matrix = s->inter_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
+ "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
+ "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
+ "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
+ "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
+ "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $4, %%mm0 \n\t"
+ "psraw $4, %%mm1 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm1 \n\t"
+ "por %%mm7, %%mm0 \n\t"
+ "por %%mm7, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "js 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+}
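+
+/* The inter path above follows the same pattern; only the core formula
+ * differs (a sketch, same conventions as the intra reference above):
+ *     level = ((2*FFABS(block[i]) + 1) * qscale * quant_matrix[i]) >> 4;
+ *     level = (level - 1) | 1;
+ */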
+
+static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+ int block0;
+
+ assert(s->block_last_index[n]>=0);
+
+ if(s->alternate_scan) nCoeffs= 63; //FIXME
+ else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ quant_matrix = s->intra_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $15, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psraw $3, %%mm0 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+ block[0]= block0;
+ //Note, we do not do mismatch control for intra as errors cannot accumulate
+}
+
+static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ x86_reg nCoeffs;
+ const uint16_t *quant_matrix;
+
+ assert(s->block_last_index[n]>=0);
+
+ if(s->alternate_scan) nCoeffs= 63; //FIXME
+ else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+
+ quant_matrix = s->inter_matrix;
+__asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlq $48, %%mm7 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t"
+ "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
+ "movq (%1, %%"REG_a"), %%mm4 \n\t"
+ "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
+ "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
+ "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
+ "pxor %%mm2, %%mm2 \n\t"
+ "pxor %%mm3, %%mm3 \n\t"
+ "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
+ "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
+ "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
+ "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
+ "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
+ "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
+ "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
+ "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t" // FIXME slow
+ "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
+ "psrlw $4, %%mm0 \n\t"
+ "psrlw $4, %%mm1 \n\t"
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm3, %%mm1 \n\t"
+ "pandn %%mm0, %%mm4 \n\t"
+ "pandn %%mm1, %%mm5 \n\t"
+ "pxor %%mm4, %%mm7 \n\t"
+ "pxor %%mm5, %%mm7 \n\t"
+ "movq %%mm4, (%0, %%"REG_a") \n\t"
+ "movq %%mm5, 8(%0, %%"REG_a") \n\t"
+
+ "add $16, %%"REG_a" \n\t"
+ "jng 1b \n\t"
+ "movd 124(%0, %3), %%mm0 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "psrlq $32, %%mm7 \n\t"
+ "pxor %%mm6, %%mm7 \n\t"
+ "movq %%mm7, %%mm6 \n\t"
+ "psrlq $16, %%mm7 \n\t"
+ "pxor %%mm6, %%mm7 \n\t"
+ "pslld $31, %%mm7 \n\t"
+ "psrlq $15, %%mm7 \n\t"
+ "pxor %%mm7, %%mm0 \n\t"
+ "movd %%mm0, 124(%0, %3) \n\t"
+
+ ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
+ : "%"REG_a, "memory"
+ );
+}
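+
+/* Scalar sketch of the mismatch control performed in the asm tail above
+ * (illustrative): fold the XOR of all dequantized coefficients down to a
+ * single parity bit and, if the coefficient sum would be even, toggle the
+ * LSB of the last coefficient. */
+#if 0
+static void mpeg2_mismatch_control_ref(DCTELEM *block)
+{
+    int i, sum = 0;
+    for (i = 0; i < 64; i++)
+        sum ^= block[i];
+    if (!(sum & 1))
+        block[63] ^= 1;
+}
+#endif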
+
+static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "1: \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "movq (%0), %%mm2 \n\t"
+ "movq 8(%0), %%mm3 \n\t"
+ "pcmpgtw %%mm2, %%mm0 \n\t"
+ "pcmpgtw %%mm3, %%mm1 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psubusw (%2), %%mm2 \n\t"
+ "psubusw 8(%2), %%mm3 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, %%mm2 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "punpcklwd %%mm7, %%mm4 \n\t"
+ "punpckhwd %%mm7, %%mm2 \n\t"
+ "punpcklwd %%mm7, %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm3 \n\t"
+ "paddd (%1), %%mm4 \n\t"
+ "paddd 8(%1), %%mm2 \n\t"
+ "paddd 16(%1), %%mm5 \n\t"
+ "paddd 24(%1), %%mm3 \n\t"
+ "movq %%mm4, (%1) \n\t"
+ "movq %%mm2, 8(%1) \n\t"
+ "movq %%mm5, 16(%1) \n\t"
+ "movq %%mm3, 24(%1) \n\t"
+ "add $16, %0 \n\t"
+ "add $32, %1 \n\t"
+ "add $16, %2 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ );
+}
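+
+/* Scalar sketch of the denoising above (illustrative): accumulate each
+ * coefficient's absolute value into sum[] and shrink the coefficient
+ * toward zero by offset[i], saturating at zero (psubusw). The SSE2
+ * variant below computes the same thing 16 coefficients at a time. */
+#if 0
+static void denoise_dct_ref(DCTELEM *block, int *sum, const uint16_t *offset)
+{
+    int i;
+    for (i = 0; i < 64; i++) {
+        int a = FFABS(block[i]);
+        sum[i] += a;
+        a = a > offset[i] ? a - offset[i] : 0;
+        block[i] = block[i] < 0 ? -a : a;
+    }
+}
+#endif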
+
+static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "pxor %%xmm0, %%xmm0 \n\t"
+ "pxor %%xmm1, %%xmm1 \n\t"
+ "movdqa (%0), %%xmm2 \n\t"
+ "movdqa 16(%0), %%xmm3 \n\t"
+ "pcmpgtw %%xmm2, %%xmm0 \n\t"
+ "pcmpgtw %%xmm3, %%xmm1 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, %%xmm4 \n\t"
+ "movdqa %%xmm3, %%xmm5 \n\t"
+ "psubusw (%2), %%xmm2 \n\t"
+ "psubusw 16(%2), %%xmm3 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm3, 16(%0) \n\t"
+ "movdqa %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm5, %%xmm0 \n\t"
+ "punpcklwd %%xmm7, %%xmm4 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "punpcklwd %%xmm7, %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm0 \n\t"
+ "paddd (%1), %%xmm4 \n\t"
+ "paddd 16(%1), %%xmm6 \n\t"
+ "paddd 32(%1), %%xmm5 \n\t"
+ "paddd 48(%1), %%xmm0 \n\t"
+ "movdqa %%xmm4, (%1) \n\t"
+ "movdqa %%xmm6, 16(%1) \n\t"
+ "movdqa %%xmm5, 32(%1) \n\t"
+ "movdqa %%xmm0, 48(%1) \n\t"
+ "add $32, %0 \n\t"
+ "add $64, %1 \n\t"
+ "add $32, %2 \n\t"
+ "cmp %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ );
+}
+
+#ifdef HAVE_SSSE3
+#define HAVE_SSSE3_BAK
+#endif
+#undef HAVE_SSSE3
+
+#undef HAVE_SSE2
+#undef HAVE_MMX2
+#define RENAME(a) a ## _MMX
+#define RENAMEl(a) a ## _mmx
+#include "mpegvideo_mmx_template.c"
+
+#define HAVE_MMX2
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _MMX2
+#define RENAMEl(a) a ## _mmx2
+#include "mpegvideo_mmx_template.c"
+
+#define HAVE_SSE2
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSE2
+#define RENAMEl(a) a ## _sse2
+#include "mpegvideo_mmx_template.c"
+
+#ifdef HAVE_SSSE3_BAK
+#define HAVE_SSSE3
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSSE3
+#define RENAMEl(a) a ## _sse2
+#include "mpegvideo_mmx_template.c"
+#endif
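+
+/* Each inclusion above instantiates one copy of the template: RENAME(a)
+ * builds the function name (e.g. dct_quantize_SSE2) and the HAVE_* macros
+ * select the matching MM/SPREADW/PMAXW variants inside the template. */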
+
+void MPV_common_init_mmx(MpegEncContext *s)
+{
+ if (mm_flags & FF_MM_MMX) {
+ const int dct_algo = s->avctx->dct_algo;
+
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
+ s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
+ s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
+ if(!(s->flags & CODEC_FLAG_BITEXACT))
+ s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+ s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
+
+ if (mm_flags & FF_MM_SSE2) {
+ s->denoise_dct= denoise_dct_sse2;
+ } else {
+ s->denoise_dct= denoise_dct_mmx;
+ }
+
+ if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
+#ifdef HAVE_SSSE3
+ if(mm_flags & FF_MM_SSSE3){
+ s->dct_quantize= dct_quantize_SSSE3;
+ } else
+#endif
+ if(mm_flags & FF_MM_SSE2){
+ s->dct_quantize= dct_quantize_SSE2;
+ } else if(mm_flags & FF_MM_MMXEXT){
+ s->dct_quantize= dct_quantize_MMX2;
+ } else {
+ s->dct_quantize= dct_quantize_MMX;
+ }
+ }
+ }
+}
diff --git a/libavcodec/x86/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c
new file mode 100644
index 0000000000..a1aae5fdd4
--- /dev/null
+++ b/libavcodec/x86/mpegvideo_mmx_template.c
@@ -0,0 +1,376 @@
+/*
+ * MPEG video MMX templates
+ *
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MMREG_WIDTH
+#undef MM
+#undef MOVQ
+#undef SPREADW
+#undef PMAXW
+#undef PMAX
+#undef SAVE_SIGN
+#undef RESTORE_SIGN
+
+#if defined(HAVE_SSE2)
+#define MMREG_WIDTH "16"
+#define MM "%%xmm"
+#define MOVQ "movdqa"
+#define SPREADW(a) \
+ "pshuflw $0, "a", "a" \n\t"\
+ "punpcklwd "a", "a" \n\t"
+#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "movhlps "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshuflw $0x0E, "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshuflw $0x01, "a", "b" \n\t"\
+ PMAXW(b, a)
+#else
+#define MMREG_WIDTH "8"
+#define MM "%%mm"
+#define MOVQ "movq"
+#if defined(HAVE_MMX2)
+#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
+#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "pshufw $0x0E, "a", "b" \n\t"\
+ PMAXW(b, a)\
+ "pshufw $0x01, "a", "b" \n\t"\
+ PMAXW(b, a)
+#else
+#define SPREADW(a) \
+ "punpcklwd "a", "a" \n\t"\
+ "punpcklwd "a", "a" \n\t"
+#define PMAXW(a,b) \
+ "psubusw "a", "b" \n\t"\
+ "paddw "a", "b" \n\t"
+#define PMAX(a,b) \
+ "movq "a", "b" \n\t"\
+ "psrlq $32, "a" \n\t"\
+ PMAXW(b, a)\
+ "movq "a", "b" \n\t"\
+ "psrlq $16, "a" \n\t"\
+ PMAXW(b, a)
+
+#endif
+#endif
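+
+/* Note on the plain-MMX PMAXW above: pmaxsw requires MMX2, so an unsigned
+ * 16-bit max is emulated with saturating arithmetic, using the identity
+ * sat(b - a) + a == max(a, b) for unsigned words. */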
+
+#ifdef HAVE_SSSE3
+#define SAVE_SIGN(a,b) \
+ "movdqa "b", "a" \n\t"\
+ "pabsw "b", "b" \n\t"
+#define RESTORE_SIGN(a,b) \
+ "psignw "a", "b" \n\t"
+#else
+#define SAVE_SIGN(a,b) \
+ "pxor "a", "a" \n\t"\
+ "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
+ "pxor "a", "b" \n\t"\
+ "psubw "a", "b" \n\t" /* ABS(block[i]) */
+#define RESTORE_SIGN(a,b) \
+ "pxor "a", "b" \n\t"\
+ "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+#endif
+
+static int RENAME(dct_quantize)(MpegEncContext *s,
+ DCTELEM *block, int n,
+ int qscale, int *overflow)
+{
+ x86_reg last_non_zero_p1;
+    int level=0, q; //=0 only silences a spurious gcc "may be used uninitialized" warning
+ const uint16_t *qmat, *bias;
+ DECLARE_ALIGNED_16(int16_t, temp_block[64]);
+
+ assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
+
+ //s->fdct (block);
+ RENAMEl(ff_fdct) (block); //cannot be anything else ...
+
+ if(s->dct_error_sum)
+ s->denoise_dct(s, block);
+
+ if (s->mb_intra) {
+ int dummy;
+ if (n < 4)
+ q = s->y_dc_scale;
+ else
+ q = s->c_dc_scale;
+ /* note: block[0] is assumed to be positive */
+ if (!s->h263_aic) {
+#if 1
+ __asm__ volatile (
+ "mul %%ecx \n\t"
+ : "=d" (level), "=a"(dummy)
+ : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
+ );
+#else
+ __asm__ volatile (
+ "xorl %%edx, %%edx \n\t"
+ "divw %%cx \n\t"
+ "movzwl %%ax, %%eax \n\t"
+ : "=a" (level)
+ : "a" ((block[0]>>2) + q), "c" (q<<1)
+ : "%edx"
+ );
+#endif
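+            /* Both variants above compute the rounded DC quantization
+             * level = ((block[0]>>2) + q) / (2*q); the first one avoids
+             * the slow div by taking the high 32 bits of a multiply with
+             * the precomputed reciprocal ff_inverse[2*q] (~ 2^32/(2*q)). */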
+ } else
+ /* For AIC we skip quant/dequant of INTRADC */
+ level = (block[0] + 4)>>3;
+
+ block[0]=0; //avoid fake overflow
+// temp_block[0] = (block[0] + (q >> 1)) / q;
+ last_non_zero_p1 = 1;
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
+ } else {
+ last_non_zero_p1 = 0;
+ bias = s->q_inter_matrix16[qscale][1];
+ qmat = s->q_inter_matrix16[qscale][0];
+ }
+
+ if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
+
+ __asm__ volatile(
+ "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
+ SPREADW(MM"3")
+ "pxor "MM"7, "MM"7 \n\t" // 0
+ "pxor "MM"4, "MM"4 \n\t" // 0
+ MOVQ" (%2), "MM"5 \n\t" // qmat[0]
+ "pxor "MM"6, "MM"6 \n\t"
+ "psubw (%3), "MM"6 \n\t" // -bias[0]
+ "mov $-128, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
+ SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
+ "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
+ "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
+ "por "MM"0, "MM"4 \n\t"
+ RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
+ MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
+ "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
+ MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
+ MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
+ "pandn "MM"1, "MM"0 \n\t"
+ PMAXW(MM"0", MM"3")
+ "add $"MMREG_WIDTH", %%"REG_a" \n\t"
+ " js 1b \n\t"
+ PMAX(MM"3", MM"0")
+ "movd "MM"3, %%"REG_a" \n\t"
+ "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ : "+a" (last_non_zero_p1)
+ : "r" (block+64), "r" (qmat), "r" (bias),
+ "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+ );
+    }else{ // per-coefficient matrices (MPEG-1/2, and MPEG-4 with mpeg_quant)
+ __asm__ volatile(
+ "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
+ SPREADW(MM"3")
+ "pxor "MM"7, "MM"7 \n\t" // 0
+ "pxor "MM"4, "MM"4 \n\t" // 0
+ "mov $-128, %%"REG_a" \n\t"
+ ASMALIGN(4)
+ "1: \n\t"
+ MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
+ SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
+            MOVQ" (%3, %%"REG_a"), "MM"6    \n\t" // bias[i]
+            "paddusw "MM"6, "MM"0           \n\t" // ABS(block[i]) + bias[i]
+            MOVQ" (%2, %%"REG_a"), "MM"5    \n\t" // qmat[i]
+            "pmulhw "MM"5, "MM"0            \n\t" // ((ABS(block[i]) + bias[i])*qmat[i])>>16
+            "por "MM"0, "MM"4               \n\t"
+            RESTORE_SIGN(MM"1", MM"0")            // out=(((ABS(block[i]) + bias[i])*qmat[i])>>16)*sign(block[i])
+ MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
+ "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
+ MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
+ MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
+ "pandn "MM"1, "MM"0 \n\t"
+ PMAXW(MM"0", MM"3")
+ "add $"MMREG_WIDTH", %%"REG_a" \n\t"
+ " js 1b \n\t"
+ PMAX(MM"3", MM"0")
+ "movd "MM"3, %%"REG_a" \n\t"
+ "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
+ : "+a" (last_non_zero_p1)
+ : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+ "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
+ );
+ }
+ __asm__ volatile(
+ "movd %1, "MM"1 \n\t" // max_qcoeff
+ SPREADW(MM"1")
+ "psubusw "MM"1, "MM"4 \n\t"
+ "packuswb "MM"4, "MM"4 \n\t"
+#ifdef HAVE_SSE2
+ "packuswb "MM"4, "MM"4 \n\t"
+#endif
+ "movd "MM"4, %0 \n\t" // *overflow
+ : "=g" (*overflow)
+ : "g" (s->max_qcoeff)
+ );
+
+ if(s->mb_intra) block[0]= level;
+ else block[0]= temp_block[0];
+
+ if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
+ block[0x20] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
+ block[0x09] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
+ block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
+ block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
+ block[0x0C] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
+ block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
+ block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
+ block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
+ block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
+ block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
+ block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
+ block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
+ block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
+ block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
+ block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
+ block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
+ block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
+ block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
+ block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
+ block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
+ block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x04] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
+ block[0x05] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x1C] = temp_block[0x19];
+ block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
+ block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
+ block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
+ block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
+ block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
+ block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
+ block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
+ block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
+ block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
+ block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
+ block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }else{
+ if(last_non_zero_p1 <= 1) goto end;
+ block[0x01] = temp_block[0x01];
+ block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
+ if(last_non_zero_p1 <= 4) goto end;
+ block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
+ block[0x03] = temp_block[0x03];
+ if(last_non_zero_p1 <= 7) goto end;
+ block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
+ block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
+ if(last_non_zero_p1 <= 11) goto end;
+ block[0x19] = temp_block[0x19];
+ block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
+ block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
+ if(last_non_zero_p1 <= 16) goto end;
+ block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
+ block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
+ block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
+ block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
+ if(last_non_zero_p1 <= 24) goto end;
+ block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
+ block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
+ block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
+ block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
+ if(last_non_zero_p1 <= 32) goto end;
+ block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
+ block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
+ block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
+ block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
+ if(last_non_zero_p1 <= 40) goto end;
+ block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
+ block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
+ block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
+ block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
+ if(last_non_zero_p1 <= 48) goto end;
+ block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
+ block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
+ block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
+ block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
+ if(last_non_zero_p1 <= 56) goto end;
+ block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
+ block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
+ block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
+ block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+ }
+ end:
+/*
+ for(i=0; i<last_non_zero_p1; i++)
+ {
+ int j= zigzag_direct_noperm[i];
+ block[block_permute_op(j)]= temp_block[j];
+ }
+*/
+
+ return last_non_zero_p1 - 1;
+}
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
new file mode 100644
index 0000000000..6306fcbd44
--- /dev/null
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -0,0 +1,1294 @@
+/*
+ * Simple IDCT MMX
+ *
+ * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "libavcodec/dsputil.h"
+#include "libavcodec/simple_idct.h"
+
+/*
+23170.475006
+22725.260826
+21406.727617
+19265.545870
+16384.000000
+12872.826198
+8866.956905
+4520.335430
+*/
+#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#if 0
+#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#else
+#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+#endif
+#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
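+
+/* A hypothetical generator for the constants above (not part of the build):
+ *     for (i = 0; i < 8; i++)
+ *         printf("#define C%d %d\n", i,
+ *                (int)(cos(i*M_PI/16) * sqrt(2) * (1<<14) + 0.5));
+ * C4 is rounded down instead; see the #if/#else above. */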
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 20 // 6
+
+DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
+
+DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
+ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
+// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
+// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
+ 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
+ // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
+// 0, 0, 0, 0,
+// 0, 0, 0, 0,
+
+ C4, C4, C4, C4,
+ C4, -C4, C4, -C4,
+
+ C2, C6, C2, C6,
+ C6, -C2, C6, -C2,
+
+ C1, C3, C1, C3,
+ C5, C7, C5, C7,
+
+ C3, -C7, C3, -C7,
+-C1, -C5, -C1, -C5,
+
+ C5, -C1, C5, -C1,
+ C7, C3, C7, C3,
+
+ C7, -C5, C7, -C5,
+ C3, -C1, C3, -C1
+};
+
+#if 0
+static void unused_var_killer(){
+ int a= wm1010 + d40000;
+ temp[0]=a;
+}
+
+static void inline idctCol (int16_t * col, int16_t *input)
+{
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+    const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+/*
+ if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
+ col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
+ col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
+ return;
+ }*/
+
+    col[8*0] = input[8*0 + 0];
+    col[8*1] = input[8*2 + 0];
+    col[8*2] = input[8*0 + 1];
+    col[8*3] = input[8*2 + 1];
+    col[8*4] = input[8*4 + 0];
+    col[8*5] = input[8*6 + 0];
+    col[8*6] = input[8*4 + 1];
+    col[8*7] = input[8*6 + 1];
+
+ a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
+ a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
+
+ b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
+ b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
+ b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
+ b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
+
+ col[8*0] = (a0 + b0) >> COL_SHIFT;
+ col[8*1] = (a1 + b1) >> COL_SHIFT;
+ col[8*2] = (a2 + b2) >> COL_SHIFT;
+ col[8*3] = (a3 + b3) >> COL_SHIFT;
+ col[8*4] = (a3 - b3) >> COL_SHIFT;
+ col[8*5] = (a2 - b2) >> COL_SHIFT;
+ col[8*6] = (a1 - b1) >> COL_SHIFT;
+ col[8*7] = (a0 - b0) >> COL_SHIFT;
+}
+
+static void inline idctRow (int16_t * output, int16_t * input)
+{
+ int16_t row[8];
+
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+    const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+    row[0] = input[0];
+    row[2] = input[1];
+    row[4] = input[4];
+    row[6] = input[5];
+    row[1] = input[8];
+    row[3] = input[9];
+    row[5] = input[12];
+    row[7] = input[13];
+
+ if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
+ row[0] = row[1] = row[2] = row[3] = row[4] =
+ row[5] = row[6] = row[7] = row[0]<<3;
+ output[0] = row[0];
+ output[2] = row[1];
+ output[4] = row[2];
+ output[6] = row[3];
+ output[8] = row[4];
+ output[10] = row[5];
+ output[12] = row[6];
+ output[14] = row[7];
+ return;
+ }
+
+ a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
+ a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
+ a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
+ a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
+
+ b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+ b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+ b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+ b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+ row[0] = (a0 + b0) >> ROW_SHIFT;
+ row[1] = (a1 + b1) >> ROW_SHIFT;
+ row[2] = (a2 + b2) >> ROW_SHIFT;
+ row[3] = (a3 + b3) >> ROW_SHIFT;
+ row[4] = (a3 - b3) >> ROW_SHIFT;
+ row[5] = (a2 - b2) >> ROW_SHIFT;
+ row[6] = (a1 - b1) >> ROW_SHIFT;
+ row[7] = (a0 - b0) >> ROW_SHIFT;
+
+ output[0] = row[0];
+ output[2] = row[1];
+ output[4] = row[2];
+ output[6] = row[3];
+ output[8] = row[4];
+ output[10] = row[5];
+ output[12] = row[6];
+ output[14] = row[7];
+}
+#endif
+
+static inline void idct(int16_t *block)
+{
+ DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
+ int16_t * const temp= (int16_t*)align_tmp;
+
+ __asm__ volatile(
+#if 0 //Alternative, simpler variant
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"\
+
+
+#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
+
+DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+#else
+
+#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq "MANGLE(wm1010)", %%mm4 \n\t"\
+ "pand %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz 1f \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+ "jmp 2f \n\t"\
+ "1: \n\t"\
+ "pslld $16, %%mm0 \n\t"\
+ "paddd "MANGLE(d40000)", %%mm0 \n\t"\
+ "psrad $13, %%mm0 \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t"\
+ "movq %%mm0, " #dst " \n\t"\
+ "movq %%mm0, 8+" #dst " \n\t"\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 24+" #dst " \n\t"\
+ "2: \n\t"
+
+#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq %%mm0, %%mm4 \n\t"\
+ "por %%mm1, %%mm4 \n\t"\
+ "por %%mm2, %%mm4 \n\t"\
+ "por %%mm3, %%mm4 \n\t"\
+ "packssdw %%mm4,%%mm4 \n\t"\
+ "movd %%mm4, %%eax \n\t"\
+ "orl %%eax, %%eax \n\t"\
+ "jz " #bt " \n\t"\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ #rounder ", %%mm4 \n\t"\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ #rounder ", %%mm0 \n\t"\
+ "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm0, %%mm0 \n\t" \
+ "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
+ "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
+ "movq %%mm7, " #dst " \n\t"\
+ "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "movq %%mm2, 24+" #dst " \n\t"\
+ "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
+ "movq %%mm2, 8+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
+ "movq %%mm4, 16+" #dst " \n\t"\
+
+//IDCT( src0, src4, src1, src5, dst, rounder, shift)
+DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "4: \n\t"
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "6: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm1, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm1, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "2: \n\t"
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
+
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
+ "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
+ "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
+ "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm2, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
+ "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
+ "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
+ "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
+ "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
+ "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
+ "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm2, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "3: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 64(%2), %%mm3 \n\t"\
+ "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
+ "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm1, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm1, 32+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+ "#" ASMALIGN(4) \
+ "5: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
+ "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
+ "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
+ "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movq %%mm5, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movq %%mm6, 48+" #dst " \n\t"\
+ "movq %%mm6, 64+" #dst " \n\t"\
+ "movq %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "#" ASMALIGN(4) \
+ "1: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
+ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
+ "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
+ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
+ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
+ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
+ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
+ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
+ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
+ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
+ "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
+ "movq 64(%2), %%mm1 \n\t"\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
+ "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
+ "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "psrad $" #shift ", %%mm7 \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
+ "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
+ "movd %%mm7, " #dst " \n\t"\
+ "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
+ "movd %%mm0, 16+" #dst " \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
+ "movd %%mm3, 96+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
+ "movd %%mm4, 112+" #dst " \n\t"\
+ "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
+ "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
+ "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
+ "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
+ "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
+ "psrad $" #shift ", %%mm3 \n\t"\
+ "psrad $" #shift ", %%mm5 \n\t"\
+ "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
+ "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
+ "psrad $" #shift ", %%mm6 \n\t"\
+ "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
+ "movd %%mm3, 32+" #dst " \n\t"\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
+ "movd %%mm6, 48+" #dst " \n\t"\
+ "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
+ "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
+ "movd %%mm4, 64+" #dst " \n\t"\
+ "movd %%mm5, 80+" #dst " \n\t"
+
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+ "jmp 9f \n\t"
+
+
+ "#" ASMALIGN(4)
+ "7: \n\t"
+#undef IDCT
+#define IDCT(src0, src4, src1, src5, dst, shift) \
+ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "psrad $" #shift ", %%mm4 \n\t"\
+ "psrad $" #shift ", %%mm0 \n\t"\
+ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
+ "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
+ "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
+ "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
+ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
+ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
+ "psrad $" #shift ", %%mm1 \n\t"\
+ "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
+ "movq %%mm4, " #dst " \n\t"\
+ "psrad $" #shift ", %%mm2 \n\t"\
+ "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
+ "movq %%mm0, 16+" #dst " \n\t"\
+ "movq %%mm0, 96+" #dst " \n\t"\
+ "movq %%mm4, 112+" #dst " \n\t"\
+ "movq %%mm0, 32+" #dst " \n\t"\
+ "movq %%mm4, 48+" #dst " \n\t"\
+ "movq %%mm4, 64+" #dst " \n\t"\
+ "movq %%mm0, 80+" #dst " \n\t"
+
+//IDCT( src0, src4, src1, src5, dst, shift)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+
+
+#endif
+
+/*
+Input
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
+ 11 31 13 33 51 71 53 73
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+
+Temp
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
+ 01 03 11 13 21 23 31 33
+ 41 43 51 53 61 63 71 73
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
+ 05 07 15 17 25 27 35 37
+ 45 47 55 57 65 67 75 77
+*/
+
+"9: \n\t"
+ :: "r" (block), "r" (temp), "r" (coeffs)
+ : "%eax"
+ );
+}
+
+void ff_simple_idct_mmx(int16_t *block)
+{
+ idct(block);
+}
+
+//FIXME merge add/put into the idct
+
+void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ idct(block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ idct(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
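+
+/* For reference, a plain-C sketch of the two wrappers above (an
+ * illustration only; put_pixels_clamped_mmx()/add_pixels_clamped_mmx()
+ * are the MMX equivalents actually used):
+ *
+ *   idct(block);
+ *   for (i = 0; i < 8; i++)
+ *       for (j = 0; j < 8; j++)
+ *           dest[i*line_size + j] = av_clip_uint8(block[i*8 + j]); // put
+ *           // add: av_clip_uint8(dest[i*line_size + j] + block[i*8 + j])
+ */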
diff --git a/libavcodec/x86/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c
new file mode 100644
index 0000000000..93119787fd
--- /dev/null
+++ b/libavcodec/x86/snowdsp_mmx.c
@@ -0,0 +1,871 @@
+/*
+ * MMX and SSE2 optimized snow DSP utils
+ * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/snow.h"
+
+void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
+ const int w2= (width+1)>>1;
+ DECLARE_ALIGNED_16(IDWTELEM, temp[width>>1]);
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0]; // By allowing the first entry, b[0], to be calculated twice
+ // (the first time erroneously), we let the SSE2 code run one extra full pass.
+ // The code and time saved are well worth having to store this value and
+ // recalculate b[0] correctly afterwards.
+
+ i = 0;
+ __asm__ volatile(
+ "pcmpeqd %%xmm7, %%xmm7 \n\t"
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"
+ "psllw $1, %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "psllw $13, %%xmm3 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm2 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ "pmulhw %%xmm3, %%xmm2 \n\t"
+ "pmulhw %%xmm3, %%xmm6 \n\t"
+ "paddw (%0), %%xmm2 \n\t"
+ "paddw 16(%0), %%xmm6 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm6, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
+ dst[i] = dst[i] - (b[i] + b[i + 1]);
+ }
+ for(; i<w_r-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm1 \n\t"
+ "movdqu 16(%1), %%xmm5 \n\t"
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw %%xmm1, %%xmm2 \n\t"
+ "paddw %%xmm5, %%xmm6 \n\t"
+ "movdqa (%0), %%xmm0 \n\t"
+ "movdqa 16(%0), %%xmm4 \n\t"
+ "psubw %%xmm2, %%xmm0 \n\t"
+ "psubw %%xmm6, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+ IDWTELEM b_0 = b[0];
+
+ i = 0;
+ __asm__ volatile(
+ "psllw $15, %%xmm7 \n\t"
+ "pcmpeqw %%xmm6, %%xmm6 \n\t"
+ "psrlw $13, %%xmm6 \n\t"
+ "paddw %%xmm7, %%xmm6 \n\t"
+ ::);
+ for(; i<w_l-15; i+=16){
+ __asm__ volatile(
+ "movdqu (%1), %%xmm0 \n\t"
+ "movdqu 16(%1), %%xmm4 \n\t"
+ "movdqu 2(%1), %%xmm1 \n\t"
+ "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
+ "paddw %%xmm6, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm5, %%xmm4 \n\t"
+ "psubw %%xmm7, %%xmm0 \n\t"
+ "psubw %%xmm7, %%xmm4 \n\t"
+ "psraw $1, %%xmm0 \n\t"
+ "psraw $1, %%xmm4 \n\t"
+ "movdqa (%0), %%xmm1 \n\t"
+ "movdqa 16(%0), %%xmm5 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "psraw $2, %%xmm0 \n\t"
+ "psraw $2, %%xmm4 \n\t"
+ "paddw %%xmm1, %%xmm0 \n\t"
+ "paddw %%xmm5, %%xmm4 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm4, 16(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+
+ i = 0;
+ for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
+ temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
+ }
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movdqu 2(%1), %%xmm2 \n\t"
+ "movdqu 18(%1), %%xmm6 \n\t"
+ "paddw (%1), %%xmm2 \n\t"
+ "paddw 16(%1), %%xmm6 \n\t"
+ "movdqu (%0), %%xmm0 \n\t"
+ "movdqu 16(%0), %%xmm4 \n\t"
+ "paddw %%xmm2, %%xmm0 \n\t"
+ "paddw %%xmm6, %%xmm4 \n\t"
+ "psraw $1, %%xmm2 \n\t"
+ "psraw $1, %%xmm6 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "paddw %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm2, (%2) \n\t"
+ "movdqa %%xmm6, 16(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x3E) != 0x3E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=62; i>=0; i-=64){
+ __asm__ volatile(
+ "movdqa (%1), %%xmm0 \n\t"
+ "movdqa 16(%1), %%xmm2 \n\t"
+ "movdqa 32(%1), %%xmm4 \n\t"
+ "movdqa 48(%1), %%xmm6 \n\t"
+ "movdqa (%1), %%xmm1 \n\t"
+ "movdqa 16(%1), %%xmm3 \n\t"
+ "movdqa 32(%1), %%xmm5 \n\t"
+ "movdqa 48(%1), %%xmm7 \n\t"
+ "punpcklwd (%2), %%xmm0 \n\t"
+ "punpcklwd 16(%2), %%xmm2 \n\t"
+ "punpcklwd 32(%2), %%xmm4 \n\t"
+ "punpcklwd 48(%2), %%xmm6 \n\t"
+ "movdqa %%xmm0, (%0) \n\t"
+ "movdqa %%xmm2, 32(%0) \n\t"
+ "movdqa %%xmm4, 64(%0) \n\t"
+ "movdqa %%xmm6, 96(%0) \n\t"
+ "punpckhwd (%2), %%xmm1 \n\t"
+ "punpckhwd 16(%2), %%xmm3 \n\t"
+ "punpckhwd 32(%2), %%xmm5 \n\t"
+ "punpckhwd 48(%2), %%xmm7 \n\t"
+ "movdqa %%xmm1, 16(%0) \n\t"
+ "movdqa %%xmm3, 48(%0) \n\t"
+ "movdqa %%xmm5, 80(%0) \n\t"
+ "movdqa %%xmm7, 112(%0) \n\t"
+ :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
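+
+/* A plain-C model of one horizontal lifting step as vectorized above (an
+ * illustrative sketch; the exact boundary handling lives in
+ * snow_horizontal_compose_lift_lead_out()):
+ *
+ *   for (i = 0; i < n; i++)
+ *       dst[i] -= (mul * (ref[i] + ref[i + 1]) + add) >> shift;
+ *
+ * Lift 2 adds instead of subtracting and folds in a 4*b[i] term; the SSE2
+ * code above replaces the multiplies with pmulhw and pavgw/psraw tricks.
+ */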
+
+void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
+ const int w2= (width+1)>>1;
+ IDWTELEM temp[width >> 1];
+ const int w_l= (width>>1);
+ const int w_r= w2 - 1;
+ int i;
+
+ { // Lift 0
+ IDWTELEM * const ref = b + w2 - 1;
+
+ i = 1;
+ b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm3, %%mm3 \n\t"
+ "psllw $1, %%mm3 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "psllw $13, %%mm3 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "paddw %%mm7, %%mm2 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ "pmulhw %%mm3, %%mm2 \n\t"
+ "pmulhw %%mm3, %%mm6 \n\t"
+ "paddw (%0), %%mm2 \n\t"
+ "paddw 8(%0), %%mm6 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm6, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
+ }
+
+ { // Lift 1
+ IDWTELEM * const dst = b+w2;
+
+ i = 0;
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm2 \n\t"
+ "movq 8(%1), %%mm6 \n\t"
+ "paddw 2(%1), %%mm2 \n\t"
+ "paddw 10(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "psubw %%mm2, %%mm0 \n\t"
+ "psubw %%mm6, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&dst[i]), "r"(&b[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
+ }
+
+ { // Lift 2
+ IDWTELEM * const ref = b+w2 - 1;
+
+ i = 1;
+ b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
+ __asm__ volatile(
+ "psllw $15, %%mm7 \n\t"
+ "pcmpeqw %%mm6, %%mm6 \n\t"
+ "psrlw $13, %%mm6 \n\t"
+ "paddw %%mm7, %%mm6 \n\t"
+ ::);
+ for(; i<w_l-7; i+=8){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm4 \n\t"
+ "movq 2(%1), %%mm1 \n\t"
+ "movq 10(%1), %%mm5 \n\t"
+ "paddw %%mm6, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm5, %%mm4 \n\t"
+ "psubw %%mm7, %%mm0 \n\t"
+ "psubw %%mm7, %%mm4 \n\t"
+ "psraw $1, %%mm0 \n\t"
+ "psraw $1, %%mm4 \n\t"
+ "movq (%0), %%mm1 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "psraw $2, %%mm0 \n\t"
+ "psraw $2, %%mm4 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm5, %%mm4 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ :: "r"(&b[i]), "r"(&ref[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
+ }
+
+ { // Lift 3
+ IDWTELEM * const src = b+w2;
+ i = 0;
+
+ for(; i<w_r-7; i+=8){
+ __asm__ volatile(
+ "movq 2(%1), %%mm2 \n\t"
+ "movq 10(%1), %%mm6 \n\t"
+ "paddw (%1), %%mm2 \n\t"
+ "paddw 8(%1), %%mm6 \n\t"
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ "psraw $1, %%mm2 \n\t"
+ "psraw $1, %%mm6 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "paddw %%mm4, %%mm6 \n\t"
+ "movq %%mm2, (%2) \n\t"
+ "movq %%mm6, 8(%2) \n\t"
+ :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
+ : "memory"
+ );
+ }
+ snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
+ }
+
+ {
+ snow_interleave_line_header(&i, width, b, temp);
+
+ for (; (i & 0x1E) != 0x1E; i-=2){
+ b[i+1] = temp[i>>1];
+ b[i] = b[i>>1];
+ }
+ for (i-=30; i>=0; i-=32){
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 8(%1), %%mm2 \n\t"
+ "movq 16(%1), %%mm4 \n\t"
+ "movq 24(%1), %%mm6 \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq 8(%1), %%mm3 \n\t"
+ "movq 16(%1), %%mm5 \n\t"
+ "movq 24(%1), %%mm7 \n\t"
+ "punpcklwd (%2), %%mm0 \n\t"
+ "punpcklwd 8(%2), %%mm2 \n\t"
+ "punpcklwd 16(%2), %%mm4 \n\t"
+ "punpcklwd 24(%2), %%mm6 \n\t"
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm2, 16(%0) \n\t"
+ "movq %%mm4, 32(%0) \n\t"
+ "movq %%mm6, 48(%0) \n\t"
+ "punpckhwd (%2), %%mm1 \n\t"
+ "punpckhwd 8(%2), %%mm3 \n\t"
+ "punpckhwd 16(%2), %%mm5 \n\t"
+ "punpckhwd 24(%2), %%mm7 \n\t"
+ "movq %%mm1, 8(%0) \n\t"
+ "movq %%mm3, 24(%0) \n\t"
+ "movq %%mm5, 40(%0) \n\t"
+ "movq %%mm7, 56(%0) \n\t"
+ :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
+ : "memory"
+ );
+ }
+ }
+}
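+
+/* Both horizontal functions finish by re-interleaving the two half-length
+ * bands in place, iterating downwards so no element is overwritten before
+ * it is read; in scalar form (cf. the lead-in loop above):
+ *
+ *   b[2*i]     = b[i];      // low band
+ *   b[2*i + 1] = temp[i];   // high band
+ */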
+
+#ifdef HAVE_7REGS
+#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 48("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "psubw %%"s0", %%"t0" \n\t"\
+ "psubw %%"s1", %%"t1" \n\t"\
+ "psubw %%"s2", %%"t2" \n\t"\
+ "psubw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
+ "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
+ "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
+ "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
+ "movdqa %%"s3", 48("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
+ "psraw $"n", %%"t0" \n\t"\
+ "psraw $"n", %%"t1" \n\t"\
+ "psraw $"n", %%"t2" \n\t"\
+ "psraw $"n", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "paddw %%"s0", %%"t0" \n\t"\
+ "paddw %%"s1", %%"t1" \n\t"\
+ "paddw %%"s2", %%"t2" \n\t"\
+ "paddw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "pmulhw %%"s0", %%"t0" \n\t"\
+ "pmulhw %%"s1", %%"t1" \n\t"\
+ "pmulhw %%"s2", %%"t2" \n\t"\
+ "pmulhw %%"s3", %%"t3" \n\t"
+
+#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movdqa %%"s0", %%"t0" \n\t"\
+ "movdqa %%"s1", %%"t1" \n\t"\
+ "movdqa %%"s2", %%"t2" \n\t"\
+ "movdqa %%"s3", %%"t3" \n\t"
+
+void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+
+ while(i & 0x1F)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+
+ __asm__ volatile (
+ "jmp 2f \n\t"
+ "1: \n\t"
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
+
+
+ "pcmpeqw %%xmm0, %%xmm0 \n\t"
+ "pcmpeqw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm2, %%xmm2 \n\t"
+ "paddw %%xmm0, %%xmm2 \n\t"
+ "psllw $13, %%xmm2 \n\t"
+ snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
+
+ "pcmpeqw %%xmm7, %%xmm7 \n\t"
+ "pcmpeqw %%xmm5, %%xmm5 \n\t"
+ "psllw $15, %%xmm7 \n\t"
+ "psrlw $13, %%xmm5 \n\t"
+ "paddw %%xmm7, %%xmm5 \n\t"
+ snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
+ "movq (%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm0 \n\t"
+ "pavgw %%xmm3, %%xmm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
+ "paddw %%xmm7, %%xmm1 \n\t"
+ "paddw %%xmm7, %%xmm3 \n\t"
+ "pavgw %%xmm1, %%xmm4 \n\t"
+ "pavgw %%xmm3, %%xmm6 \n\t"
+ snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+
+ snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
+ snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
+ snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
+
+ "2: \n\t"
+ "sub $64, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+
+#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
+ ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
+ ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
+ ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
+ ""op" 24("r",%%"REG_d"), %%"t3" \n\t"
+
+#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
+ snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
+
+#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
+ "movq %%"s0", ("w",%%"REG_d") \n\t"\
+ "movq %%"s1", 8("w",%%"REG_d") \n\t"\
+ "movq %%"s2", 16("w",%%"REG_d") \n\t"\
+ "movq %%"s3", 24("w",%%"REG_d") \n\t"
+
+#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
+ "movq %%"s0", %%"t0" \n\t"\
+ "movq %%"s1", %%"t1" \n\t"\
+ "movq %%"s2", %%"t2" \n\t"\
+ "movq %%"s3", %%"t3" \n\t"
+
+
+void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
+ x86_reg i = width;
+ while(i & 15)
+ {
+ i--;
+ b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
+ b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
+ b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
+ b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
+ }
+ i+=i;
+ __asm__ volatile(
+ "jmp 2f \n\t"
+ "1: \n\t"
+
+ snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
+ "pcmpeqw %%mm0, %%mm0 \n\t"
+ "pcmpeqw %%mm2, %%mm2 \n\t"
+ "paddw %%mm2, %%mm2 \n\t"
+ "paddw %%mm0, %%mm2 \n\t"
+ "psllw $13, %%mm2 \n\t"
+ snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "pcmpeqw %%mm5, %%mm5 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "psrlw $13, %%mm5 \n\t"
+ "paddw %%mm7, %%mm5 \n\t"
+ snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
+ "movq (%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm0 \n\t"
+ "pavgw %%mm3, %%mm2 \n\t"
+ "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
+ "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+ "paddw %%mm7, %%mm3 \n\t"
+ "pavgw %%mm1, %%mm4 \n\t"
+ "pavgw %%mm3, %%mm6 \n\t"
+ snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+
+ snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
+ snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
+ snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
+
+ "2: \n\t"
+ "sub $32, %%"REG_d" \n\t"
+ "jge 1b \n\t"
+ :"+d"(i)
+ :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
+}
+#endif //HAVE_7REGS
+
+#define snow_inner_add_yblock_sse2_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
+ "pcmpeqd %%xmm3, %%xmm3 \n\t"\
+ "psllw $15, %%xmm3 \n\t"\
+ "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movq (%%"REG_d"), %%"out_reg1" \n\t"\
+ "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
+ "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
+ "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ "pmullw %%xmm0, %%"out_reg1" \n\t"\
+ "pmullw %%xmm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
+ snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
+ "paddusw %%xmm2, %%xmm1 \n\t"\
+ "paddusw %%xmm6, %%xmm5 \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common1\
+ "add $32, %%"REG_S" \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"
+
+#define snow_inner_add_yblock_sse2_end_common2\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+#define snow_inner_add_yblock_sse2_end_8\
+ "sal $1, %%"REG_c" \n\t"\
+ "add $"PTR_SIZE"*2, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "sar $1, %%"REG_c" \n\t"\
+ "sub $2, %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+#define snow_inner_add_yblock_sse2_end_16\
+ "add $"PTR_SIZE"*1, %1 \n\t"\
+ snow_inner_add_yblock_sse2_end_common1\
+ "dec %2 \n\t"\
+ snow_inner_add_yblock_sse2_end_common2
+
+static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_8("2", "8")
+snow_inner_add_yblock_sse2_accum_8("1", "128")
+snow_inner_add_yblock_sse2_accum_8("0", "136")
+
+ "mov %0, %%"REG_d" \n\t"
+ "movdqa (%%"REG_D"), %%xmm0 \n\t"
+ "movdqa %%xmm1, %%xmm2 \n\t"
+
+ "punpckhwd %%xmm7, %%xmm1 \n\t"
+ "punpcklwd %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm0 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm3, %%xmm0 \n\t"
+ "paddd %%xmm3, %%xmm2 \n\t"
+
+ "mov %1, %%"REG_D" \n\t"
+ "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
+ "add %3, %%"REG_D" \n\t"
+
+ "movdqa (%%"REG_D"), %%xmm4 \n\t"
+ "movdqa %%xmm5, %%xmm6 \n\t"
+ "punpckhwd %%xmm7, %%xmm5 \n\t"
+ "punpcklwd %%xmm7, %%xmm6 \n\t"
+ "paddd %%xmm6, %%xmm4 \n\t"
+ "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
+ "paddd %%xmm5, %%xmm6 \n\t"
+ "paddd %%xmm3, %%xmm4 \n\t"
+ "paddd %%xmm3, %%xmm6 \n\t"
+
+ "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm2, %%xmm0 \n\t"
+ "packuswb %%xmm7, %%xmm0 \n\t"
+ "movq %%xmm0, (%%"REG_d") \n\t"
+
+ "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
+ "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
+ "packssdw %%xmm6, %%xmm4 \n\t"
+ "packuswb %%xmm7, %%xmm4 \n\t"
+ "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
+snow_inner_add_yblock_sse2_end_8
+}
+
+static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_sse2_header
+snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
+snow_inner_add_yblock_sse2_accum_16("2", "16")
+snow_inner_add_yblock_sse2_accum_16("1", "512")
+snow_inner_add_yblock_sse2_accum_16("0", "528")
+
+ "mov %0, %%"REG_d" \n\t"
+ "psrlw $4, %%xmm1 \n\t"
+ "psrlw $4, %%xmm5 \n\t"
+ "paddw (%%"REG_D"), %%xmm1 \n\t"
+ "paddw 16(%%"REG_D"), %%xmm5 \n\t"
+ "paddw %%xmm3, %%xmm1 \n\t"
+ "paddw %%xmm3, %%xmm5 \n\t"
+ "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
+ "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
+ "packuswb %%xmm5, %%xmm1 \n\t"
+
+ "movdqu %%xmm1, (%%"REG_d") \n\t"
+
+snow_inner_add_yblock_sse2_end_16
+}
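+
+/* Roughly, the per-pixel effect of the bw_16/obmc_32 case above, read off
+ * the macro arguments (an illustrative sketch; the portable reference is
+ * ff_snow_inner_add_yblock()):
+ *
+ *   sum     = obmc[x]       * block[3][x] + obmc[x +  16] * block[2][x]
+ *           + obmc[x + 512] * block[1][x] + obmc[x + 528] * block[0][x];
+ *   dst8[x] = av_clip_uint8(((sum >> 4) + dst[x] + bias) >> 4);
+ *
+ * with dst being the current slice-buffer line and bias the rounder built
+ * into xmm3 by the header macro.
+ */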
+
+#define snow_inner_add_yblock_mmx_header \
+ IDWTELEM * * dst_array = sb->line + src_y;\
+ x86_reg tmp;\
+ __asm__ volatile(\
+ "mov %7, %%"REG_c" \n\t"\
+ "mov %6, %2 \n\t"\
+ "mov %4, %%"REG_S" \n\t"\
+ "pxor %%mm7, %%mm7 \n\t" /* 0 */\
+ "pcmpeqd %%mm3, %%mm3 \n\t"\
+ "psllw $15, %%mm3 \n\t"\
+ "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
+ "1: \n\t"\
+ "mov %1, %%"REG_D" \n\t"\
+ "mov (%%"REG_D"), %%"REG_D" \n\t"\
+ "add %3, %%"REG_D" \n\t"
+
+#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
+ "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
+ "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
+ "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg1" \n\t"\
+ "punpcklbw %%mm7, %%"out_reg2" \n\t"\
+ "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
+ "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "pmullw %%mm0, %%"out_reg1" \n\t"\
+ "pmullw %%mm4, %%"out_reg2" \n\t"
+
+#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
+ snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
+ "paddusw %%mm2, %%mm1 \n\t"\
+ "paddusw %%mm6, %%mm5 \n\t"
+
+#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
+ "mov %0, %%"REG_d" \n\t"\
+ "psrlw $4, %%mm1 \n\t"\
+ "psrlw $4, %%mm5 \n\t"\
+ "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
+ "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psraw $4, %%mm1 \n\t"\
+ "psraw $4, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm1 \n\t"\
+ "movq %%mm1, "write_offset"(%%"REG_d") \n\t"
+
+#define snow_inner_add_yblock_mmx_end(s_step)\
+ "add $"s_step", %%"REG_S" \n\t"\
+ "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
+ "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
+ "add %%"REG_c", (%%"REG_a") \n\t"\
+ "add $"PTR_SIZE"*1, %1 \n\t"\
+ "add %%"REG_c", %0 \n\t"\
+ "dec %2 \n\t"\
+ "jnz 1b \n\t"\
+ :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
+ :\
+ "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\
+ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
+
+static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "8", "0")
+snow_inner_add_yblock_mmx_accum("1", "128", "0")
+snow_inner_add_yblock_mmx_accum("0", "136", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+snow_inner_add_yblock_mmx_end("16")
+}
+
+static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
+ int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+snow_inner_add_yblock_mmx_header
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
+snow_inner_add_yblock_mmx_accum("2", "16", "0")
+snow_inner_add_yblock_mmx_accum("1", "512", "0")
+snow_inner_add_yblock_mmx_accum("0", "528", "0")
+snow_inner_add_yblock_mmx_mix("0", "0")
+
+snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
+snow_inner_add_yblock_mmx_accum("2", "24", "8")
+snow_inner_add_yblock_mmx_accum("1", "520", "8")
+snow_inner_add_yblock_mmx_accum("0", "536", "8")
+snow_inner_add_yblock_mmx_mix("16", "8")
+snow_inner_add_yblock_mmx_end("32")
+}
+
+void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16) {
+ if (!(b_h & 1))
+ inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ } else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
+
+void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
+ int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
+ if (b_w == 16)
+ inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else if (b_w == 8 && obmc_stride == 16)
+ inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+ else
+ ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
+}
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
new file mode 100644
index 0000000000..b5db0ed9e4
--- /dev/null
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -0,0 +1,490 @@
+/*
+ * VC-1 and WMV3 - DSP functions MMX-optimized
+ * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_mmx.h"
+
+/** Add the rounder in mm7 to mm3 and mm4, then shift both right by SHIFT */
+#define NORMALIZE_MMX(SHIFT) \
+ "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
+ "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
+ "psraw "SHIFT", %%mm3 \n\t" \
+ "psraw "SHIFT", %%mm4 \n\t"
+
+#define TRANSFER_DO_PACK \
+ "packuswb %%mm4, %%mm3 \n\t" \
+ "movq %%mm3, (%2) \n\t"
+
+#define TRANSFER_DONT_PACK \
+ "movq %%mm3, 0(%2) \n\t" \
+ "movq %%mm4, 8(%2) \n\t"
+
+/** @see MSPEL_FILTER13_CORE for use as the UNPACK macro */
+#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
+#define DONT_UNPACK(reg)
+
+/** Compute the rounder 32-r or 8-r and unpack it into mm7 */
+#define LOAD_ROUNDER_MMX(ROUND) \
+ "movd "ROUND", %%mm7 \n\t" \
+ "punpcklwd %%mm7, %%mm7 \n\t" \
+ "punpckldq %%mm7, %%mm7 \n\t"
+
+#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
+ "paddw %%mm"#R2", %%mm"#R1" \n\t" \
+ "movd (%0,%3), %%mm"#R0" \n\t" \
+ "pmullw %%mm6, %%mm"#R1" \n\t" \
+ "punpcklbw %%mm0, %%mm"#R0" \n\t" \
+ "movd (%0,%2), %%mm"#R3" \n\t" \
+ "psubw %%mm"#R0", %%mm"#R1" \n\t" \
+ "punpcklbw %%mm0, %%mm"#R3" \n\t" \
+ "paddw %%mm7, %%mm"#R1" \n\t" \
+ "psubw %%mm"#R3", %%mm"#R1" \n\t" \
+ "psraw %4, %%mm"#R1" \n\t" \
+ "movq %%mm"#R1", "#OFF"(%1) \n\t" \
+ "add %2, %0 \n\t"
+
+DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
+
+/** Sacrificing mm6 makes it possible to pipeline loads from src */
+static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
+ const uint8_t *src, x86_reg stride,
+ int rnd, int64_t shift)
+{
+ __asm__ volatile(
+ "mov $3, %%"REG_c" \n\t"
+ LOAD_ROUNDER_MMX("%5")
+ "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
+ "1: \n\t"
+ "movd (%0), %%mm2 \n\t"
+ "add %2, %0 \n\t"
+ "movd (%0), %%mm3 \n\t"
+ "punpcklbw %%mm0, %%mm2 \n\t"
+ "punpcklbw %%mm0, %%mm3 \n\t"
+ SHIFT2_LINE( 0, 1, 2, 3, 4)
+ SHIFT2_LINE( 24, 2, 3, 4, 1)
+ SHIFT2_LINE( 48, 3, 4, 1, 2)
+ SHIFT2_LINE( 72, 4, 1, 2, 3)
+ SHIFT2_LINE( 96, 1, 2, 3, 4)
+ SHIFT2_LINE(120, 2, 3, 4, 1)
+ SHIFT2_LINE(144, 3, 4, 1, 2)
+ SHIFT2_LINE(168, 4, 1, 2, 3)
+ "sub %6, %0 \n\t"
+ "add $8, %1 \n\t"
+ "dec %%"REG_c" \n\t"
+ "jnz 1b \n\t"
+ : "+r"(src), "+r"(dst)
+ : "r"(stride), "r"(-2*stride),
+ "m"(shift), "m"(rnd), "r"(9*stride-4)
+ : "%"REG_c, "memory"
+ );
+}
+
+/**
+ * The data is already unpacked, so some operations can be performed
+ * directly from memory.
+ */
+static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+ const int16_t *src, int rnd)
+{
+ int h = 8;
+
+ src -= 1;
+ rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */
+ __asm__ volatile(
+ LOAD_ROUNDER_MMX("%4")
+ "movq "MANGLE(ff_pw_128)", %%mm6\n\t"
+ "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"
+ "1: \n\t"
+ "movq 2*0+0(%1), %%mm1 \n\t"
+ "movq 2*0+8(%1), %%mm2 \n\t"
+ "movq 2*1+0(%1), %%mm3 \n\t"
+ "movq 2*1+8(%1), %%mm4 \n\t"
+ "paddw 2*3+0(%1), %%mm1 \n\t"
+ "paddw 2*3+8(%1), %%mm2 \n\t"
+ "paddw 2*2+0(%1), %%mm3 \n\t"
+ "paddw 2*2+8(%1), %%mm4 \n\t"
+ "pmullw %%mm5, %%mm3 \n\t"
+ "pmullw %%mm5, %%mm4 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "psubw %%mm2, %%mm4 \n\t"
+ NORMALIZE_MMX("$7")
+ /* Remove bias */
+ "paddw %%mm6, %%mm3 \n\t"
+ "paddw %%mm6, %%mm4 \n\t"
+ TRANSFER_DO_PACK
+ "add $24, %1 \n\t"
+ "add %3, %2 \n\t"
+ "decl %0 \n\t"
+ "jnz 1b \n\t"
+ : "+r"(h), "+r" (src), "+r" (dst)
+ : "r"(stride), "m"(rnd)
+ : "memory"
+ );
+}
+
+
+/**
+ * Purely vertical or horizontal 1/2 shift interpolation.
+ * Sacrifice mm6 for the *9 factor.
+ */
+static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
+ x86_reg stride, int rnd, x86_reg offset)
+{
+ rnd = 8-rnd;
+ __asm__ volatile(
+ "mov $8, %%"REG_c" \n\t"
+ LOAD_ROUNDER_MMX("%5")
+ "movq "MANGLE(ff_pw_9)", %%mm6\n\t"
+ "1: \n\t"
+ "movd 0(%0 ), %%mm3 \n\t"
+ "movd 4(%0 ), %%mm4 \n\t"
+ "movd 0(%0,%2), %%mm1 \n\t"
+ "movd 4(%0,%2), %%mm2 \n\t"
+ "add %2, %0 \n\t"
+ "punpcklbw %%mm0, %%mm3 \n\t"
+ "punpcklbw %%mm0, %%mm4 \n\t"
+ "punpcklbw %%mm0, %%mm1 \n\t"
+ "punpcklbw %%mm0, %%mm2 \n\t"
+ "paddw %%mm1, %%mm3 \n\t"
+ "paddw %%mm2, %%mm4 \n\t"
+ "movd 0(%0,%3), %%mm1 \n\t"
+ "movd 4(%0,%3), %%mm2 \n\t"
+ "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/
+ "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/
+ "punpcklbw %%mm0, %%mm1 \n\t"
+ "punpcklbw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/
+ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/
+ "movd 0(%0,%2), %%mm1 \n\t"
+ "movd 4(%0,%2), %%mm2 \n\t"
+ "punpcklbw %%mm0, %%mm1 \n\t"
+ "punpcklbw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/
+ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/
+ NORMALIZE_MMX("$4")
+ "packuswb %%mm4, %%mm3 \n\t"
+ "movq %%mm3, (%1) \n\t"
+ "add %6, %0 \n\t"
+ "add %4, %1 \n\t"
+ "dec %%"REG_c" \n\t"
+ "jnz 1b \n\t"
+ : "+r"(src), "+r"(dst)
+ : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),
+ "g"(stride-offset)
+ : "%"REG_c, "memory"
+ );
+}
+
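+/* For reference, a scalar sketch (hypothetical, not part of the build) of the
+ * (-1,9,9,-1)/16 half-pel filter that vc1_put_shift2_mmx above vectorizes;
+ * rnd is assumed pre-adjusted to 8-rnd as done at the top of that function: */
+#if 0
+static uint8_t vc1_shift2_tap(const uint8_t *s, int off, int rnd)
+{
+    return av_clip_uint8((-s[-off] + 9*s[0] + 9*s[off] - s[2*off] + rnd) >> 4);
+}
+#endif
+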
+/**
+ * Filter coefficients are global so that all 1/4 and 3/4 shift
+ * interpolation functions can access them.
+ */
+DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL;
+DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL;
+
+/**
+ * Core of the 1/4 and 3/4 shift bicubic interpolation.
+ *
+ * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
+ * @param MOVQ "movd 1" for packed data, or "movq 2" if the data read is
+ * already unpacked.
+ * @param A1 Address of 1st tap (beware of unpacked/packed).
+ * @param A2 Address of 2nd tap
+ * @param A3 Address of 3rd tap
+ * @param A4 Address of 4th tap
+ */
+#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
+ MOVQ "*0+"A1", %%mm1 \n\t" \
+ MOVQ "*4+"A1", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
+ "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
+ MOVQ "*0+"A2", %%mm3 \n\t" \
+ MOVQ "*4+"A2", %%mm4 \n\t" \
+ UNPACK("%%mm3") \
+ UNPACK("%%mm4") \
+ "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
+ "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
+ "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
+ "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
+ MOVQ "*0+"A4", %%mm1 \n\t" \
+ MOVQ "*4+"A4", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "psllw $2, %%mm1 \n\t" /* 4* */ \
+ "psllw $2, %%mm2 \n\t" /* 4* */ \
+ "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
+ "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
+ MOVQ "*0+"A3", %%mm1 \n\t" \
+ MOVQ "*4+"A3", %%mm2 \n\t" \
+ UNPACK("%%mm1") \
+ UNPACK("%%mm2") \
+ "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
+ "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
+ "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
+ "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
+
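+/* A scalar sketch (hypothetical, not compiled) of the bicubic core above for
+ * the 8-bit 1/4-shift case: taps (-4,53,18,-3)/64, with rnd pre-adjusted to
+ * 32-rnd as in MSPEL_FILTER13_8B below; the 3/4 shift mirrors the taps: */
+#if 0
+static uint8_t vc1_shift1_tap(const uint8_t *s, int off, int rnd)
+{
+    return av_clip_uint8((-4*s[-off] + 53*s[0] + 18*s[off]
+                          - 3*s[2*off] + rnd) >> 6);
+}
+#endif
+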
+/**
+ * Macro to build the vertical 16-bit version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (src_stride) and %4 (3*src_stride).
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
+static void \
+vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
+ x86_reg src_stride, \
+ int rnd, int64_t shift) \
+{ \
+ int h = 8; \
+ src -= src_stride; \
+ __asm__ volatile( \
+ LOAD_ROUNDER_MMX("%5") \
+ "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
+ "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
+ ASMALIGN(3) \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
+ NORMALIZE_MMX("%6") \
+ TRANSFER_DONT_PACK \
+ /* Last 3 (in fact 4) bytes on the line */ \
+ "movd 8+"A1", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "movq %%mm1, %%mm3 \n\t" \
+ "paddw %%mm1, %%mm1 \n\t" \
+ "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
+ "movd 8+"A2", %%mm3 \n\t" \
+ DO_UNPACK("%%mm3") \
+ "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
+ "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
+ "movd 8+"A3", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
+ "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
+ "movd 8+"A4", %%mm1 \n\t" \
+ DO_UNPACK("%%mm1") \
+ "psllw $2, %%mm1 \n\t" /* 4* */ \
+ "psubw %%mm1, %%mm3 \n\t" \
+ "paddw %%mm7, %%mm3 \n\t" \
+ "psraw %6, %%mm3 \n\t" \
+ "movq %%mm3, 16(%2) \n\t" \
+ "add %3, %1 \n\t" \
+ "add $24, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(src_stride), "r"(3*src_stride), \
+ "m"(rnd), "m"(shift) \
+ : "memory" \
+ ); \
+}
+
+/**
+ * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
+ * Here the data is 16-bit, so the A1 to A4 parameters are plain constant
+ * offsets.
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4) \
+static void \
+vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
+ const int16_t *src, int rnd) \
+{ \
+ int h = 8; \
+ src -= 1; \
+ rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
+ __asm__ volatile( \
+ LOAD_ROUNDER_MMX("%4") \
+ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
+ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
+ ASMALIGN(3) \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
+ NORMALIZE_MMX("$7") \
+ /* Remove bias */ \
+ "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
+ "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
+ TRANSFER_DO_PACK \
+ "add $24, %1 \n\t" \
+ "add %3, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(stride), "m"(rnd) \
+ : "memory" \
+ ); \
+}
+
+/**
+ * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
+ * Here, offset=src_stride. Parameters passed A1 to A4 must use
+ * %3 (offset) and %4 (3*offset).
+ *
+ * @param NAME Either 1 or 3
+ * @see MSPEL_FILTER13_CORE for information on A1->A4
+ */
+#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4) \
+static void \
+vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
+ x86_reg stride, int rnd, x86_reg offset) \
+{ \
+ int h = 8; \
+ src -= offset; \
+ rnd = 32-rnd; \
+ __asm__ volatile ( \
+ LOAD_ROUNDER_MMX("%6") \
+ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
+ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
+ ASMALIGN(3) \
+ "1: \n\t" \
+ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
+ NORMALIZE_MMX("$6") \
+ TRANSFER_DO_PACK \
+ "add %5, %1 \n\t" \
+ "add %5, %2 \n\t" \
+ "decl %0 \n\t" \
+ "jnz 1b \n\t" \
+ : "+r"(h), "+r" (src), "+r" (dst) \
+ : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
+ : "memory" \
+ ); \
+}
+
+/** 1/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
+MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
+MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)")
+
+/** 3/4 shift bicubic interpolation */
+MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
+MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
+MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)")
+
+typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
+typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
+typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
+
+/**
+ * Interpolates fractional-pel values by applying the appropriate vertical
+ * filter, then the horizontal one.
+ *
+ * @param dst Destination buffer for interpolated pels.
+ * @param src Source buffer.
+ * @param stride Stride for both src and dst buffers.
+ * @param hmode Horizontal filter (expressed in quarter pixels shift).
+ * @param hmode Vertical filter.
+ * @param rnd Rounding bias.
+ */
+static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,
+ int hmode, int vmode, int rnd)
+{
+ static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =
+ { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };
+ static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =
+ { NULL, vc1_put_hor_16b_shift1_mmx, vc1_put_hor_16b_shift2_mmx, vc1_put_hor_16b_shift3_mmx };
+ static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =
+ { NULL, vc1_put_shift1_mmx, vc1_put_shift2_mmx, vc1_put_shift3_mmx };
+
+ __asm__ volatile(
+ "pxor %%mm0, %%mm0 \n\t"
+ ::: "memory"
+ );
+
+ if (vmode) { /* Vertical filter to apply */
+ if (hmode) { /* Horizontal filter to apply, output to tmp */
+ static const int shift_value[] = { 0, 5, 1, 5 };
+ int shift = (shift_value[hmode]+shift_value[vmode])>>1;
+ int r;
+ DECLARE_ALIGNED_16(int16_t, tmp[12*8]);
+
+ r = (1<<(shift-1)) + rnd-1;
+ vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);
+
+ vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);
+ return;
+ }
+ else { /* No horizontal filter, output 8 lines to dst */
+ vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);
+ return;
+ }
+ }
+
+ /* Horizontal mode with no vertical mode */
+ vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);
+}
+
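+/* Example dispatch: hmode=1, vmode=2 gives shift = (5+1)>>1 = 3, so
+ * vc1_put_ver_16b_shift2_mmx above fills tmp with 16-bit intermediates and
+ * vc1_put_hor_16b_shift1_mmx then filters, rounds and packs them into dst. */
+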
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+
+/** Macro to ease bicubic filter interpolation functions declarations */
+#define DECLARE_FUNCTION(a, b) \
+static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
+ vc1_mspel_mc(dst, src, stride, a, b, rnd); \
+}
+
+DECLARE_FUNCTION(0, 1)
+DECLARE_FUNCTION(0, 2)
+DECLARE_FUNCTION(0, 3)
+
+DECLARE_FUNCTION(1, 0)
+DECLARE_FUNCTION(1, 1)
+DECLARE_FUNCTION(1, 2)
+DECLARE_FUNCTION(1, 3)
+
+DECLARE_FUNCTION(2, 0)
+DECLARE_FUNCTION(2, 1)
+DECLARE_FUNCTION(2, 2)
+DECLARE_FUNCTION(2, 3)
+
+DECLARE_FUNCTION(3, 0)
+DECLARE_FUNCTION(3, 1)
+DECLARE_FUNCTION(3, 2)
+DECLARE_FUNCTION(3, 3)
+
+void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
+ dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
+ dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
+ dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
+ dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
+ dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
+
+ dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
+ dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
+ dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
+ dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
new file mode 100644
index 0000000000..010bfc65d6
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (C) 2004 the ffmpeg project
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file vp3dsp_mmx.c
+ * MMX-optimized functions cribbed from the original VP3 source code.
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/dsputil.h"
+#include "dsputil_mmx.h"
+
+extern const uint16_t ff_vp3_idct_data[];
+
+// This is off by one or two in some cases when filter_limit is greater than 63.
+// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
+// out: p1 in mm4, p2 in mm3
+#define VP3_LOOP_FILTER(flim) \
+ "movq %%mm6, %%mm7 \n\t" \
+ "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
+ "psrlw $3, %%mm7 \n\t" \
+ "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
+ "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
+ "pxor %%mm4, %%mm2 \n\t" \
+ "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
+ "movq %%mm2, %%mm5 \n\t" \
+ "paddb %%mm2, %%mm2 \n\t" \
+ "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
+ "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
+ "pcmpeqb %%mm0, %%mm0 \n\t" \
+ "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
+ "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
+ "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
+ "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
+ "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
+ "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \
+ "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
+ "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \
+ "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \
+ "psubusb %%mm7, %%mm6 \n\t" \
+ "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \
+\
+ "movq "#flim", %%mm5 \n\t" \
+ "pminub %%mm5, %%mm6 \n\t" \
+ "pminub %%mm5, %%mm7 \n\t" \
+ "movq %%mm6, %%mm0 \n\t" \
+ "movq %%mm7, %%mm1 \n\t" \
+ "paddb %%mm6, %%mm6 \n\t" \
+ "paddb %%mm7, %%mm7 \n\t" \
+ "pminub %%mm5, %%mm6 \n\t" \
+ "pminub %%mm5, %%mm7 \n\t" \
+ "psubb %%mm0, %%mm6 \n\t" \
+ "psubb %%mm1, %%mm7 \n\t" \
+ "paddusb %%mm7, %%mm4 \n\t" \
+ "psubusb %%mm6, %%mm4 \n\t" \
+ "psubusb %%mm7, %%mm3 \n\t" \
+ "paddusb %%mm6, %%mm3 \n\t"
+
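+/* A scalar sketch of the filter above, in the p0..p3 naming of its comment:
+ * roughly f = bounding_values[(p0 - p3 + 3*(p2 - p1) + 4) >> 3], then
+ * p1 += f and p2 -= f, both saturated to 0..255. The pavgb-based arithmetic
+ * only approximates this, hence the off-by-one note above. */
+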
+#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
+ "movd "#mm", %0 \n\t" \
+ "movw %w0, -1"#dst0" \n\t" \
+ "psrlq $32, "#mm" \n\t" \
+ "shr $16, %0 \n\t" \
+ "movw %w0, -1"#dst1" \n\t" \
+ "movd "#mm", %0 \n\t" \
+ "movw %w0, -1"#dst2" \n\t" \
+ "shr $16, %0 \n\t" \
+ "movw %w0, -1"#dst3" \n\t"
+
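+/* In STORE_4_WORDS the "-1" is string-pasted onto each dst operand, so e.g.
+ * STORE_4_WORDS((%1), ...) writes its 16-bit words at -1(%1), i.e. the two
+ * pixels straddling the filtered edge in the horizontal case. */
+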
+void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
+{
+ __asm__ volatile(
+ "movq %0, %%mm6 \n\t"
+ "movq %1, %%mm4 \n\t"
+ "movq %2, %%mm2 \n\t"
+ "movq %3, %%mm1 \n\t"
+
+ VP3_LOOP_FILTER(%4)
+
+ "movq %%mm4, %1 \n\t"
+ "movq %%mm3, %2 \n\t"
+
+ : "+m" (*(uint64_t*)(src - 2*stride)),
+ "+m" (*(uint64_t*)(src - 1*stride)),
+ "+m" (*(uint64_t*)(src + 0*stride)),
+ "+m" (*(uint64_t*)(src + 1*stride))
+ : "m"(*(uint64_t*)(bounding_values+129))
+ );
+}
+
+void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
+{
+ x86_reg tmp;
+
+ __asm__ volatile(
+ "movd -2(%1), %%mm6 \n\t"
+ "movd -2(%1,%3), %%mm0 \n\t"
+ "movd -2(%1,%3,2), %%mm1 \n\t"
+ "movd -2(%1,%4), %%mm4 \n\t"
+
+ TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
+ VP3_LOOP_FILTER(%5)
+ SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)
+
+ STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
+ STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
+
+ : "=&r"(tmp)
+ : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
+ "m"(*(uint64_t*)(bounding_values+129))
+ : "memory"
+ );
+}
+
+/* from original comments: The Macro does IDct on 4 1-D Dcts */
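+
+/* The cosine constants are scaled to 16 bits; those >= 0x8000 (c1..c5) are
+ * read by pmulhw as the signed value c-65536, so pmulhw alone yields
+ * (x*c >> 16) - x and the paddw that follows restores x*c >> 16:
+ *     r  = ((int32_t)x * (c - 65536)) >> 16;    (what pmulhw computes)
+ *     r += x;                                   (now r == (x*c) >> 16)
+ * c6 and c7 are below 0x8000 and need no such correction. */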
+#define BeginIDCT() \
+ "movq "I(3)", %%mm2 \n\t" \
+ "movq "C(3)", %%mm6 \n\t" \
+ "movq %%mm2, %%mm4 \n\t" \
+ "movq "J(5)", %%mm7 \n\t" \
+ "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \
+ "movq "C(5)", %%mm1 \n\t" \
+ "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \
+ "movq %%mm1, %%mm5 \n\t" \
+ "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \
+ "movq "I(1)", %%mm3 \n\t" \
+ "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \
+ "movq "C(1)", %%mm0 \n\t" \
+ "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \
+ "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \
+ "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \
+ "movq "J(7)", %%mm1 \n\t" \
+ "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \
+ "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \
+ "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \
+ "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \
+ "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \
+ "movq "C(7)", %%mm7 \n\t" \
+ "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \
+ "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \
+ "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \
+ "movq "I(2)", %%mm2 \n\t" \
+ "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \
+ "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \
+ "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \
+ "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \
+ "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \
+ "movq "J(6)", %%mm5 \n\t" \
+ "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \
+ "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \
+ "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \
+ "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \
+ "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \
+ "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \
+ "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \
+ "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \
+ "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \
+ "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \
+ "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \
+ "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \
+ "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \
+ "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \
+ "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \
+ "movq "C(4)", %%mm4 \n\t" \
+ "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \
+ "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
+ "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
+ "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \
+ "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \
+ "movq "I(0)", %%mm6 \n\t" \
+ "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \
+ "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \
+ "movq "J(4)", %%mm3 \n\t" \
+ "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \
+ "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \
+ "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \
+ "movq %%mm6, %%mm0 \n\t" \
+ "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \
+ "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \
+ "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \
+ "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \
+ "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \
+ "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \
+ "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \
+ "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \
+ "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \
+ "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \
+ "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \
+ "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \
+ "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */
+
+/* RowIDCT gets ready to transpose */
+#define RowIDCT() \
+ BeginIDCT() \
+ "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
+ "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
+ "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
+ "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
+ "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
+ "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \
+ "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
+ "paddsw %%mm3, %%mm3 \n\t" \
+ "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
+ "paddsw %%mm5, %%mm5 \n\t" \
+ "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
+ "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
+ "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
+ "paddsw %%mm0, %%mm0 \n\t" \
+ "movq %%mm1, "I(1)"\n\t" /* save R1 */ \
+ "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */
+
+/* Column IDCT normalizes and stores final results */
+#define ColumnIDCT() \
+ BeginIDCT() \
+ "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \
+ "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
+ "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
+ "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \
+ "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
+ "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \
+ "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
+ "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
+ "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \
+ "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
+ "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \
+ "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
+ "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \
+ "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \
+ "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
+ "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \
+ "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
+ "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \
+ "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \
+ "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \
+ "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
+ "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \
+ "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \
+ "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \
+ "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \
+ "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
+ "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \
+ "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \
+ "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \
+ "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \
+ "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \
+ "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \
+ "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \
+ "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \
+ "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */
+
+/* The following macro does two 4x4 transposes in place.
+
+ At entry (we assume):
+
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
+
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
+
+ At exit, we have:
+
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
+
+ I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+ J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+
+ Since r1 is free at entry, we calculate the Js first. */
+#define Transpose() \
+ "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \
+ "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \
+ "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \
+ "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \
+ "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \
+ "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \
+ "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \
+ "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \
+ "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \
+ "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \
+ "movq %%mm4, "J(4)"\n\t" \
+ "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \
+ "movq %%mm5, "J(5)"\n\t" \
+ "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \
+ "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \
+ "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \
+ "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \
+ "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \
+ "movq %%mm6, "J(7)"\n\t" \
+ "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \
+ "movq %%mm1, "J(6)"\n\t" \
+ "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \
+ "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \
+ "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \
+ "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \
+ "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \
+ "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \
+ "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \
+ "movq %%mm0, "I(0)"\n\t" \
+ "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \
+ "movq %%mm1, "I(1)"\n\t" \
+ "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \
+ "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \
+ "movq %%mm4, "I(3)"\n\t" \
+ "movq %%mm2, "I(2)"\n\t"
+
+void ff_vp3_idct_mmx(int16_t *output_data)
+{
+ /* eax = quantized input
+ * ebx = dequantizer matrix
+ * ecx = IDCT constants
+ * M(I) = ecx + MaskOffset(0) + I * 8
+ * C(I) = ecx + CosineOffset(32) + (I-1) * 8
+ * edx = output
+ * r0..r7 = mm0..mm7
+ */
+
+#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
+#define OC_8 "%2"
+
+ /* at this point, function has completed dequantization + dezigzag +
+ * partial transposition; now do the idct itself */
+#define I(x) AV_STRINGIFY(16* x )"(%0)"
+#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
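+/* e.g. I(2) expands to the operand string "32(%0)" and J(5) to "24(%0)". */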
+
+ __asm__ volatile (
+ RowIDCT()
+ Transpose()
+
+#undef I
+#undef J
+#define I(x) AV_STRINGIFY(16* x + 64)"(%0)"
+#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
+
+ RowIDCT()
+ Transpose()
+
+#undef I
+#undef J
+#define I(x) AV_STRINGIFY(16*x)"(%0)"
+#define J(x) AV_STRINGIFY(16*x)"(%0)"
+
+ ColumnIDCT()
+
+#undef I
+#undef J
+#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
+#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
+
+ ColumnIDCT()
+ :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
+ );
+#undef I
+#undef J
+
+}
+
+void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_vp3_idct_mmx(block);
+ put_signed_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_vp3_idct_mmx(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
new file mode 100644
index 0000000000..e565a33023
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -0,0 +1,35 @@
+/*
+ * vp3dsp MMX function declarations
+ * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP3DSP_MMX_H
+#define AVCODEC_X86_VP3DSP_MMX_H
+
+#include <stdint.h>
+#include "libavcodec/dsputil.h"
+
+void ff_vp3_idct_mmx(int16_t *data);
+void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+
+void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+
+#endif /* AVCODEC_X86_VP3DSP_MMX_H */
diff --git a/libavcodec/x86/vp3dsp_sse2.c b/libavcodec/x86/vp3dsp_sse2.c
new file mode 100644
index 0000000000..82670c74ef
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_sse2.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (C) 2004 the ffmpeg project
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file vp3dsp_sse2.c
+ * SSE2-optimized functions cribbed from the original VP3 source code.
+ */
+
+#include "libavcodec/dsputil.h"
+#include "dsputil_mmx.h"
+
+DECLARE_ALIGNED_16(const uint16_t, ff_vp3_idct_data[7 * 8]) =
+{
+ 64277,64277,64277,64277,64277,64277,64277,64277,
+ 60547,60547,60547,60547,60547,60547,60547,60547,
+ 54491,54491,54491,54491,54491,54491,54491,54491,
+ 46341,46341,46341,46341,46341,46341,46341,46341,
+ 36410,36410,36410,36410,36410,36410,36410,36410,
+ 25080,25080,25080,25080,25080,25080,25080,25080,
+ 12785,12785,12785,12785,12785,12785,12785,12785
+};
+
+
+#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \
+ "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \
+ "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \
+ "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \
+ "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \
+ "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \
+ "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \
+ "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \
+ "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \
+ "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \
+ "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \
+ "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \
+ "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \
+ "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \
+ "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \
+ "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \
+ "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \
+ "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \
+ "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \
+ "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \
+ "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \
+ "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \
+ "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \
+ "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \
+ "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \
+ "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \
+ "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \
+ "pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \
+ "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \
+ "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \
+ "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \
+ "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \
+ "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \
+ "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \
+ "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \
+ "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \
+ "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \
+ "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \
+ "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \
+ "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \
+ "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \
+ "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \
+ "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \
+ "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \
+ "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \
+ "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \
+ "movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \
+ "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \
+ "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \
+ "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \
+ "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
+ "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \
+ "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \
+ "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \
+ "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \
+ "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
+ "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \
+ "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \
+ "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \
+ "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \
+ "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \
+ "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \
+ "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \
+ "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \
+ "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \
+ "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \
+ "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \
+ "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
+ "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \
+ "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \
+ "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \
+ "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \
+ "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \
+ "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \
+ "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \
+ ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \
+ "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \
+ "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \
+ SHIFT(%%xmm2) /* xmm2 = op2 */ \
+ "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \
+ SHIFT(%%xmm1) /* xmm1 = op1 */ \
+ "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \
+ "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \
+ "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \
+ "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \
+ ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \
+ "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \
+ "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \
+ SHIFT(%%xmm4) /* xmm4 = op4 */ \
+ "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \
+ SHIFT(%%xmm3) /* xmm3 = op3 */ \
+ ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \
+ "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \
+ "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \
+ SHIFT(%%xmm6) /* xmm6 = op6 */ \
+ SHIFT(%%xmm5) /* xmm5 = op5 */ \
+ "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \
+ ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \
+ "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \
+ "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. */ \
+ SHIFT(%%xmm7) /* xmm7 = op7 */ \
+ SHIFT(%%xmm0) /* xmm0 = op0 */
+
+#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \
+ "movdqa " #r0 ", " O(0) "\n\t" \
+ "movdqa " #r1 ", " O(1) "\n\t" \
+ "movdqa " #r2 ", " O(2) "\n\t" \
+ "movdqa " #r3 ", " O(3) "\n\t" \
+ "movdqa " #r4 ", " O(4) "\n\t" \
+ "movdqa " #r5 ", " O(5) "\n\t" \
+ "movdqa " #r6 ", " O(6) "\n\t" \
+ "movdqa " #r7 ", " O(7) "\n\t"
+
+#define NOP(xmm)
+#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t"
+#define ADD8(xmm) "paddsw %2, "#xmm"\n\t"
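+
+/* NOP/NOP is plugged in for the row pass (no normalization yet); the column
+ * pass uses ADD8/SHIFT4 to round by ff_pw_8 (operand %2) and shift right
+ * by 4, mirroring the OC_8 adjustment in the MMX ColumnIDCT. */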
+
+void ff_vp3_idct_sse2(int16_t *input_data)
+{
+#define I(x) AV_STRINGIFY(16*x)"(%0)"
+#define O(x) I(x)
+#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
+
+ __asm__ volatile (
+ VP3_1D_IDCT_SSE2(NOP, NOP)
+
+ TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0))
+ PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)
+
+ VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
+ PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+ :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
+ );
+}
+
+void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_vp3_idct_sse2(block);
+ put_signed_pixels_clamped_mmx(block, dest, line_size);
+}
+
+void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_vp3_idct_sse2(block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
diff --git a/libavcodec/x86/vp3dsp_sse2.h b/libavcodec/x86/vp3dsp_sse2.h
new file mode 100644
index 0000000000..9094620eb2
--- /dev/null
+++ b/libavcodec/x86/vp3dsp_sse2.h
@@ -0,0 +1,31 @@
+/*
+ * vp3dsp SSE2 function declarations
+ * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP3DSP_SSE2_H
+#define AVCODEC_X86_VP3DSP_SSE2_H
+
+#include "libavcodec/dsputil.h"
+
+void ff_vp3_idct_sse2(int16_t *input_data);
+void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+
+#endif /* AVCODEC_X86_VP3DSP_SSE2_H */
diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm
new file mode 100644
index 0000000000..3729b5b101
--- /dev/null
+++ b/libavcodec/x86/x86inc.asm
@@ -0,0 +1,540 @@
+;*****************************************************************************
+;* x86inc.asm
+;*****************************************************************************
+;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+; FIXME: All of the 64bit asm functions that take a stride as an argument
+; via register, assume that the high dword of that register is filled with 0.
+; This is true in practice (since we never do any 64bit arithmetic on strides,
+; and x264's strides are all positive), but is not guaranteed by the ABI.
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=16
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=16
+ fakegot:
+ %else
+ SECTION .rodata align=16
+ %endif
+%endmacro
+
+; PIC support macros. All these macros are totally harmless when PIC is
+; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
+; objects cannot directly access global variables by address; they need to
+; go through the GOT (global offset table). Most OSes do not care about it
+; and let you load non-shared .so objects (Linux, Win32...). However, OS X
+; requires PIC code in its .dylib objects.
+;
+; - GLOBAL should be used as a suffix for global addressing, e.g.
+; picgetgot ebx
+; mov eax, [foo GLOBAL]
+; instead of
+; mov eax, [foo]
+;
+; - picgetgot computes the GOT address into the given register in PIC
+; mode, otherwise does nothing. You need to do this before using GLOBAL.
+; This must come before GLOBAL in both execution order and compiled code
+; order (so that GLOBAL knows which register the GOT is in).
+
+%ifndef PIC
+ %define GLOBAL
+ %macro picgetgot 1
+ %endmacro
+%elifdef ARCH_X86_64
+ %define PIC64
+ %define GLOBAL wrt rip
+ %macro picgetgot 1
+ %endmacro
+%else
+ %define PIC32
+ %ifidn __OUTPUT_FORMAT__,macho
+ ; There is no real global offset table on OS X, but we still
+ ; need to reference our variables by offset.
+ %macro picgetgot 1
+ call %%getgot
+ %%getgot:
+ pop %1
+ add %1, $$ - %%getgot
+ %undef GLOBAL
+ %define GLOBAL + %1 - fakegot
+ %endmacro
+ %else ; elf
+ extern _GLOBAL_OFFSET_TABLE_
+ %macro picgetgot 1
+ call %%getgot
+ %%getgot:
+ pop %1
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
+ %undef GLOBAL
+ %define GLOBAL + %1 wrt ..gotoff
+ %endmacro
+ %endif
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
+; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one scratch register (tmp), and not using globals
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
+; which are slow when a normal ret follows a branch.
+
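+; A minimal usage sketch (hypothetical function, not part of this file):
+;     cglobal add_one, 1,1
+;         inc dword [r0]   ; r0 is the first argument, loaded by PROLOGUE
+;         RET
+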
+%macro DECLARE_REG 6
+ %define r%1q %2
+ %define r%1d %3
+ %define r%1w %4
+ %define r%1b %5
+ %define r%1m %6
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 2
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1b %2
+ %define e%1b %2
+%ifndef ARCH_X86_64
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al
+DECLARE_REG_SIZE bx, bl
+DECLARE_REG_SIZE cx, cl
+DECLARE_REG_SIZE dx, dl
+DECLARE_REG_SIZE si, sil
+DECLARE_REG_SIZE di, dil
+DECLARE_REG_SIZE bp, bpl
+
+%ifdef ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1b r %+ %%i %+ b
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %assign n_arg_names %%i
+%endmacro
+
+%ifdef ARCH_X86_64 ;==========================================================
+%ifidn __OUTPUT_FORMAT__,win32
+
+DECLARE_REG 0, rcx, ecx, cx, cl, ecx
+DECLARE_REG 1, rdx, edx, dx, dl, edx
+DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
+DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
+%define r7m [rsp + stack_offset + 64]
+%define r8m [rsp + stack_offset + 72]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp + 8 + %1*8]
+ %endif
+%endmacro
+
+%else ;=======================================================================
+
+DECLARE_REG 0, rdi, edi, di, dil, edi
+DECLARE_REG 1, rsi, esi, si, sil, esi
+DECLARE_REG 2, rdx, edx, dx, dl, edx
+DECLARE_REG 3, rcx, ecx, cx, cl, ecx
+DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
+%define r7m [rsp + stack_offset + 16]
+%define r8m [rsp + stack_offset + 24]
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp - 40 + %1*8]
+ %endif
+%endmacro
+
+%endif ; !WIN64
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+ ASSERT %2 >= %1
+ ASSERT %2 <= 7
+ %assign stack_offset 0
+%ifidn __OUTPUT_FORMAT__,win32
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+%endif
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro RET 0
+ ret
+%endmacro
+
+%macro REP_RET 0
+ rep ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
+DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
+DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
+DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
+DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
+DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
+DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
+%define r7m [esp + stack_offset + 32]
+%define r8m [esp + stack_offset + 36]
+%define rsp esp
+
+%macro PUSH_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ push r%1
+ %assign stack_offset stack_offset+4
+ %endif
+%endmacro
+
+%macro POP_IF_USED 1 ; reg_id
+ %if %1 < regs_used
+ pop r%1
+ %endif
+%endmacro
+
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [esp + stack_offset + 4 + %1*4]
+ %endif
+%endmacro
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+ ASSERT %2 >= %1
+ %assign stack_offset 0
+ %assign regs_used %2
+ %ifdef PIC
+ %if %3
+ %assign regs_used regs_used+1
+ %endif
+ %endif
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3
+ PUSH_IF_USED 4
+ PUSH_IF_USED 5
+ PUSH_IF_USED 6
+ LOAD_IF_USED 0, %1
+ LOAD_IF_USED 1, %1
+ LOAD_IF_USED 2, %1
+ LOAD_IF_USED 3, %1
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ %if %3
+ picgetgot r%2
+ %endif
+ DEFINE_ARGS %4
+%endmacro
+
+%macro RET 0
+ POP_IF_USED 6
+ POP_IF_USED 5
+ POP_IF_USED 4
+ POP_IF_USED 3
+ ret
+%endmacro
+
+%macro REP_RET 0
+ %if regs_used > 3
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%endif ;======================================================================
+
+
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Symbol prefix for C linkage
+%macro cglobal 1-2+
+ %xdefine %1 ff_%1
+ %ifdef PREFIX
+ %xdefine %1 _ %+ %1
+ %endif
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %ifdef PREFIX
+ extern _%1
+ %define %1 _%1
+ %else
+ extern %1
+ %endif
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+%assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
+
+; merge mmx and sse*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0
+ %define RESET_MM_PERMUTATION INIT_MMX
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnt movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro INIT_XMM 0
+ %define RESET_MM_PERMUTATION INIT_XMM
+ %define mmsize 16
+ %define num_mmregs 8
+ %ifdef ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnt movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
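+; After INIT_XMM, mova/movu map to movdqa/movdqu and m0..m7 (m0..m15 on
+; x86_64) name xmm registers, so the same macro body can be instantiated
+; for both the MMX and SSE register files.
+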
+INIT_MMX
+
+; I often want to use macros that permute their arguments, e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
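+; For example, after "SWAP 0, 1" the name m0 refers to what was m1 and vice
+; versa; later code uses the swapped names and no move instruction is emitted.
+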
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+%macro SAVE_MM_PERMUTATION 1
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %1_m, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro call 1
+ call %1
+ %ifdef %1_m0
+ LOAD_MM_PERMUTATION %1
+ %endif
+%endmacro
+
+; substitutions which are functionally identical but reduce code size
+%define movdqa movaps
+%define movdqu movups
+