author    Michael Niedermayer <michaelni@gmx.at>  2012-07-26 21:36:03 +0200
committer Michael Niedermayer <michaelni@gmx.at>  2012-07-26 21:37:15 +0200
commit    7333798c85837f1cf175f39bc4acb5664fa6cacc (patch)
tree      60036638a0962b3cb966d62da2eda81f93ac3267 /libavcodec/x86
parent    307a20cca216356aec30f5bb102c633169cbc0c1 (diff)
parent    44dc9c6af0377faf2a99889d1f949e32a1102e84 (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master:
  libopenjpeg: support YUV and deep RGB pixel formats
  Fix typo in v410 decoder.
  vf_yadif: unset cur_buf on the input link.
  vf_overlay: ensure the overlay frame does not get leaked.
  vf_overlay: prevent premature freeing of cur_buf
  Support urlencoded http authentication credentials
  rtmp: Return an error when the client bandwidth is incorrect
  rtmp: Return proper error code in handle_server_bw
  rtmp: Return proper error code in handle_client_bw
  rtmp: Return proper error codes in handle_chunk_size
  lavr: x86: add missing vzeroupper in ff_mix_1_to_2_fltp_flt()
  vp8: Replace x*155/100 by x*101581>>16.
  vp3: don't use calls to inline asm in yasm code.
  x86/dsputil: put inline asm under HAVE_INLINE_ASM.
  dsputil_mmx: fix incorrect assembly code
  rtmp: Factorize the code by adding handle_invoke
  rtmp: Factorize the code by adding handle_chunk_size
  rtmp: Factorize the code by adding handle_ping
  rtmp: Factorize the code by adding handle_client_bw
  rtmp: Factorize the code by adding handle_server_bw

Conflicts:
  libavcodec/libopenjpegdec.c
  libavcodec/x86/dsputil_mmx.c
  libavfilter/vf_overlay.c
  libavformat/Makefile
  libavformat/version.h

Merged-by: Michael Niedermayer <michaelni@gmx.at>
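One item above replaces a division with a fixed-point multiply: x*155/100
becomes (x*101581)>>16, since 1.55 * 65536 = 101580.8, which rounds to
101581. A minimal standalone check of the equivalence (not the VP8 code
itself; the tested range is an assumption that covers the small inputs
used there -- the two expressions first differ at x = 16389):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* (x*101581)>>16 computes floor(x * 1.5500030...), which matches
     * floor(x * 1.55) until the tiny excess over 1.55 accumulates
     * enough to cross an integer boundary. */
    for (int64_t x = 0; x < 4096; x++)
        assert(x * 155 / 100 == (x * 101581) >> 16);
    return 0;
}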
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/dsputil_mmx.c       85
-rw-r--r--  libavcodec/x86/h264_qpel_mmx.c      4
-rw-r--r--  libavcodec/x86/idct_mmx.c           4
-rw-r--r--  libavcodec/x86/idct_mmx_xvid.c      4
-rw-r--r--  libavcodec/x86/idct_sse2_xvid.c     4
-rw-r--r--  libavcodec/x86/rv40dsp_init.c       2
-rw-r--r--  libavcodec/x86/simple_idct_mmx.c    4
-rw-r--r--  libavcodec/x86/vp3dsp.asm         120
8 files changed, 160 insertions(+), 67 deletions(-)
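Most of the diff below applies one pattern: function bodies written in GNU
inline asm, and the DSPContext pointer assignments that reference them, move
under #if HAVE_INLINE_ASM, while symbols assembled separately by yasm stay
under HAVE_YASM. A compiler without GNU inline asm support (configure defines
HAVE_INLINE_ASM to 0) then still produces a linkable dsputil. A minimal
sketch of the pattern -- the struct and function names are hypothetical
stand-ins, and HAVE_INLINE_ASM is assumed to be defined to 0 or 1 as
configure does:

#include <stdint.h>

/* hypothetical stand-in for the DSPContext members involved */
typedef struct {
    void (*clear_block)(int16_t *block);
} ExampleDSPContext;

#if HAVE_INLINE_ASM
/* compiled only when the compiler accepts GNU inline asm */
static void clear_block_mmx(int16_t *block)
{
    /* zero the first 8 bytes of the block -- purely illustrative */
    __asm__ volatile (
        "pxor %%mm7, %%mm7 \n\t"
        "movq %%mm7, (%0)  \n\t"
        :: "r"(block) : "memory");
}
#endif /* HAVE_INLINE_ASM */

static void example_dsp_init(ExampleDSPContext *c)
{
#if HAVE_INLINE_ASM
    c->clear_block = clear_block_mmx; /* guard matches the definition */
#endif
    /* yasm-built functions would be registered under HAVE_YASM instead,
     * exactly as dsputil_init_mmx() does below. */
}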
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index c2ee5a85e4..1488389572 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -85,6 +85,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+#if HAVE_INLINE_ASM
+
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
@@ -246,14 +248,14 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
pix = pixels;
/* unrolled loop */
__asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
@@ -1856,6 +1858,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
@@ -1924,6 +1928,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
}
#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
+
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2093,6 +2099,8 @@ PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
+#endif /* HAVE_INLINE_ASM */
+
#include "h264_qpel_mmx.c"
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
@@ -2138,6 +2146,8 @@ CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
+#if HAVE_INLINE_ASM
+
/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
@@ -2574,6 +2584,8 @@ static void vector_clipf_sse(float *dst, const float *src,
);
}
+#endif /* HAVE_INLINE_ASM */
+
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
@@ -2686,6 +2698,7 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
@@ -2708,10 +2721,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
#if ARCH_X86_32 || !HAVE_YASM
c->gmc = gmc_mmx;
#endif
-#if ARCH_X86_32 && HAVE_YASM
- if (!high_bit_depth)
- c->emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
c->add_bytes = add_bytes_mmx;
@@ -2722,8 +2731,14 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
+#if ARCH_X86_32
+ if (!high_bit_depth)
+ c->emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2740,6 +2755,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_mmx2;
if (!high_bit_depth) {
@@ -2775,22 +2791,27 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
}
+#endif /* HAVE_INLINE_ASM */
if (CONFIG_H264QPEL) {
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
+#endif /* HAVE_INLINE_ASM */
if (!high_bit_depth) {
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
+#endif /* HAVE_INLINE_ASM */
} else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
@@ -2804,10 +2825,12 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
#endif
}
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
+#endif /* HAVE_INLINE_ASM */
}
#if HAVE_YASM
@@ -2842,6 +2865,7 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) {
@@ -2899,25 +2923,26 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
}
-#if HAVE_YASM
- if (!high_bit_depth && CONFIG_H264CHROMA) {
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
- }
-#endif
-
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#if HAVE_7REGS
if (mm_flags & AV_CPU_FLAG_CMOV)
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
+ }
+#endif
}
static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
-#if HAVE_6REGS
+#if HAVE_6REGS && HAVE_INLINE_ASM
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
@@ -2926,6 +2951,7 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
@@ -2936,31 +2962,35 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
-#if HAVE_YASM
- c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
- c->vector_fmul_add = ff_vector_fmul_add_sse;
-#endif
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
c->vector_clipf = vector_clipf_sse;
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
+ c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
+ c->vector_fmul_add = ff_vector_fmul_add_sse;
+
c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
+#if HAVE_INLINE_ASM
c->gmc = gmc_sse;
#endif
+#endif
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
const int bit_depth = avctx->bits_per_raw_sample;
+
+#if HAVE_INLINE_ASM
const int high_bit_depth = bit_depth > 8;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
@@ -2988,6 +3018,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (bit_depth == 10) {
@@ -3029,6 +3060,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
+#if HAVE_INLINE_ASM
if (!high_bit_depth && CONFIG_H264QPEL) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
@@ -3043,8 +3075,9 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
- else if (bit_depth == 10 && CONFIG_H264QPEL) {
+ if (bit_depth == 10 && CONFIG_H264QPEL) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
@@ -3108,6 +3141,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
+#if HAVE_INLINE_ASM
const int idct_algo = avctx->idct_algo;
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
@@ -3148,6 +3182,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
}
}
}
+#endif /* HAVE_INLINE_ASM */
dsputil_init_mmx(c, avctx, mm_flags);
}
diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
index 53c840ee4f..71a1fbeed9 100644
--- a/libavcodec/x86/h264_qpel_mmx.c
+++ b/libavcodec/x86/h264_qpel_mmx.c
@@ -21,6 +21,8 @@
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/***********************************/
/* motion compensation */
@@ -1191,7 +1193,7 @@ H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
-
+#endif /* HAVE_INLINE_ASM */
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c
index f199941f55..2408ab26ad 100644
--- a/libavcodec/x86/idct_mmx.c
+++ b/libavcodec/x86/idct_mmx.c
@@ -25,6 +25,8 @@
#include "libavutil/x86_cpu.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
#define ROW_SHIFT 11
#define COL_SHIFT 6
@@ -626,3 +628,5 @@ declare_idct (ff_mmxext_idct, mmxext_table,
declare_idct (ff_mmx_idct, mmx_table,
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 1b48ab52bb..5d8027fb27 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -43,6 +43,8 @@
#include "libavcodec/avcodec.h"
#include "idct_xvid.h"
+#if HAVE_INLINE_ASM
+
//=============================================================================
// Macros and other preprocessor constants
//=============================================================================
@@ -523,3 +525,5 @@ __asm__ volatile(
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index fc75a57519..3708f93df8 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -43,6 +43,8 @@
#include "idct_xvid.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/**
* @file
* @brief SSE2 idct compatible with xvidmmx
@@ -401,3 +403,5 @@ void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
ff_idct_xvid_sse2(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 41ee6a74c6..bbb1c8eb57 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -194,10 +194,12 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+#if HAVE_INLINE_ASM
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
+#endif /* HAVE_INLINE_ASM */
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
index db479ce257..20e51a47f4 100644
--- a/libavcodec/x86/simple_idct_mmx.c
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -23,6 +23,8 @@
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/*
23170.475006
22725.260826
@@ -1161,3 +1163,5 @@ void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
idct(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
+
+#endif /* HAVE_INLINE_ASM */
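The vp3dsp.asm hunk below exists because put_signed_pixels_clamped_mmx and
add_pixels_clamped_mmx are inline-asm C functions: once inline asm becomes
optional, the yasm IDCT can no longer call (or tail-jump) into them, so the
clamped store loops are inlined in yasm instead. Scalar references for the
two SIMD idioms the new code relies on -- a sketch assuming the IDCT output
is centered on zero, not FFmpeg's implementation:

#include <stdint.h>

/* vp3_idct_put store: packsswb saturates each int16 coefficient to
 * [-128,127]; paddb with pb_80 (0x80 bytes) then recenters the result
 * to the unsigned pixel range [0,255]. */
static uint8_t put_signed_clamped(int16_t v)
{
    if (v < -128) v = -128;
    if (v >  127) v =  127;
    return (uint8_t)(v + 128);
}

/* vp3_idct_add store: widen the pixel (punpcklbw with zero), add the
 * residual with signed saturation (paddsw), then clamp back to a byte
 * with unsigned saturation (packuswb). */
static uint8_t add_clamped(uint8_t pix, int16_t residual)
{
    int32_t v = (int32_t)pix + residual; /* paddsw; its saturation is
                                            absorbed by the clamp below */
    if (v <   0) v =   0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}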
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 0e0bd29a99..46bd9d8f86 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
+cextern pb_80
cextern pb_81
cextern pw_8
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
%endif
-%if WIN64
- call put_signed_pixels_clamped_mmx
- RET
-%else
- jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
%endif
+%assign %%i %%i+64
+%endrep
+ RET
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
-%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+
+ mov r3, 4
+ pxor m4, m4
+ movsxdifnidn r1, r1d
+.loop:
+ movq m0, [r0]
+ movq m1, [r0+r1]
+%if mmsize == 8
+ mova m2, m0
+ mova m3, m1
%endif
-%if WIN64
- call add_pixels_clamped_mmx
- RET
-%else
- jmp add_pixels_clamped_mmx
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+%if mmsize == 8
+ punpckhbw m2, m4
+ punpckhbw m3, m4
+%endif
+ paddsw m0, [r2+ 0]
+ paddsw m1, [r2+16]
+%if mmsize == 8
+ paddsw m2, [r2+ 8]
+ paddsw m3, [r2+24]
+ packuswb m0, m2
+ packuswb m1, m3
+%else ; mmsize == 16
+ packuswb m0, m1
%endif
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1], m1
+%else ; mmsize == 16
+ movhps [r0+r1], m0
+%endif
+ lea r0, [r0+r1*2]
+ add r2, 32
+ dec r3
+ jg .loop
+ RET
%endmacro
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]