idct permutation cleanup, idct can be selected per context now

fixing some thread-unsafe code. Originally committed as revision 980 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Michael Niedermayer <michaelni@gmx.at> 2002-09-29 22:44:22 +0000
committer: Michael Niedermayer <michaelni@gmx.at> 2002-09-29 22:44:22 +0000
commit: 2ad1516a6c7180d4f9343c0f07120eaec5130d6e (patch)
tree: 38dfb52da33739e269f30177e8b46c86067dbc67 /libavcodec/i386
parent: f9bb4bdffcbde7362db2a0e041a2893dde0ace6f (diff)
4 files changed, 241 insertions, 151 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index eaec8fe45a..708d0b091b 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -20,7 +20,6 @@
  */
 
 #include "../dsputil.h"
-#include "../simple_idct.h"
 
 int mm_flags; /* multimedia extension flags */
 
@@ -44,10 +43,6 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
 
-/* external functions, from idct_mmx.c */
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
 /* pixel operations */
 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
@@ -588,17 +583,6 @@ void dsputil_init_mmx(void)
             avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
             avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
         }
-
-        /* idct */
-        if (mm_flags & MM_MMXEXT) {
-            ff_idct = ff_mmxext_idct;
-        } else {
-            ff_idct = ff_mmx_idct;
-        }
-#ifdef SIMPLE_IDCT
-//	ff_idct = simple_idct;
-	ff_idct = simple_idct_mmx;
-#endif
     }
 
 #if 0
@@ -637,28 +621,6 @@ void dsputil_init_mmx(void)
 #endif
 }
 
-void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
-
-/**
- * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT
- */ 
-void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){
-    if(   block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0
-       && block[8]==0   && block[9]==0 && block[12]==0 && block[13]==0){
-        int16_t tmp[64];
-        int i;
-
-        for(i=0; i<64; i++)
-            tmp[i]= block[i];
-        for(i=0; i<64; i++)
-            block[i]= tmp[block_permute_op(i)];
-        
-        simple_idct_put(dest, line_size, block);
-    }
-    else
-        gen_idct_put(dest, line_size, block);
-}
-
 /* remove any non bit exact operation (testing purpose). NOTE that
    this function should be kept as small as possible because it is
    always difficult to test automatically non bit exact cases. */
@@ -682,9 +644,5 @@ void dsputil_set_bit_exact_mmx(void)
             pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
             pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
         }
-#ifdef SIMPLE_IDCT
-        if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx)
-            ff_idct_put= bit_exact_idct_put;
-#endif
     }
 }
diff --git a/libavcodec/i386/mpegvideo_mmx.c b/libavcodec/i386/mpegvideo_mmx.c
index fc5ca55f80..48d6fc5b60 100644
--- a/libavcodec/i386/mpegvideo_mmx.c
+++ b/libavcodec/i386/mpegvideo_mmx.c
@@ -23,53 +23,24 @@
 #include "../dsputil.h"
 #include "../mpegvideo.h"
 #include "../avcodec.h"
-
-extern UINT8 zigzag_end[64];
+#include "../simple_idct.h"
+
+/* Input permutation for the simple MMX IDCT (ff_simple_idct_put_mmx / ff_simple_idct_add_mmx) */
+static UINT8 simple_mmx_permutation[64]={
+	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
+	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
+	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
+	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
+	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
+	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
+	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
+	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
+};
 
 extern UINT8 zigzag_direct_noperm[64];
 extern UINT16 inv_zigzag_direct16[64];
 extern UINT32 inverse[256];
 
-#if 0
-
-/* XXX: GL: I don't understand why this function needs optimization
-   (it is called only once per frame!), so I disabled it */
-
-void MPV_frame_start(MpegEncContext *s)
-{
-    if (s->pict_type == B_TYPE) {
-	__asm __volatile(
-	    "movl	(%1), %%eax\n\t"
-	    "movl	4(%1), %%edx\n\t"
-	    "movl	8(%1), %%ecx\n\t"
-	    "movl	%%eax, (%0)\n\t"
-	    "movl	%%edx, 4(%0)\n\t"
-	    "movl	%%ecx, 8(%0)\n\t"
-	    :
-	    :"r"(s->current_picture), "r"(s->aux_picture)
-	    :"eax","edx","ecx","memory");
-    } else {
-            /* swap next and last */
-	__asm __volatile(
-	    "movl	(%1), %%eax\n\t"
-	    "movl	4(%1), %%edx\n\t"
-	    "movl	8(%1), %%ecx\n\t"
-	    "xchgl	(%0), %%eax\n\t"
-	    "xchgl	4(%0), %%edx\n\t"
-	    "xchgl	8(%0), %%ecx\n\t"
-	    "movl	%%eax, (%1)\n\t"
-	    "movl	%%edx, 4(%1)\n\t"
-	    "movl	%%ecx, 8(%1)\n\t"
-	    "movl	%%eax, (%2)\n\t"
-	    "movl	%%edx, 4(%2)\n\t"
-	    "movl	%%ecx, 8(%2)\n\t"
-	    :
-	    :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture)
-	    :"eax","edx","ecx","memory");
-    }
-}
-#endif
-
 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
 
@@ -77,36 +48,26 @@ static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x000
 static void dct_unquantize_h263_mmx(MpegEncContext *s,
                                   DCTELEM *block, int n, int qscale)
 {
-    int i, level, qmul, qadd, nCoeffs;
-    
-    qmul = s->qscale << 1;
-    if (s->h263_aic && s->mb_intra)
-        qadd = 0;
-    else
-        qadd = (s->qscale - 1) | 1;
+    int level, qmul, qadd, nCoeffs;
+
+    qmul = qscale << 1;
+    qadd = (qscale - 1) | 1;
 
+    assert(s->block_last_index[n]>=0);
+        
     if (s->mb_intra) {
         if (!s->h263_aic) {
             if (n < 4)
-                block[0] = block[0] * s->y_dc_scale;
+                level = block[0] * s->y_dc_scale;
             else
-                block[0] = block[0] * s->c_dc_scale;
+                level = block[0] * s->c_dc_scale;
+        }else{
+            qadd = 0;
+            level= block[0];
         }
-        for(i=1; i<8; i++) {
-            level = block[i];
-            if (level) {
-    	        if (level < 0) {
-                    level = level * qmul - qadd;
-                } else {
-                    level = level * qmul + qadd;
-                }
-                block[i] = level;
-            }
-        }
-        nCoeffs=64;
+        nCoeffs=63;
     } else {
-        i = 0;
-        nCoeffs= zigzag_end[ s->block_last_index[n] ];
+        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
     }
 //printf("%d %d  ", qmul, qadd);
 asm volatile(
@@ -152,10 +113,12 @@ asm volatile(
 		"movq %%mm1, 8(%0, %3)		\n\t"
 
 		"addl $16, %3			\n\t"
-		"js 1b				\n\t"
-		::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs))
+		"jng 1b				\n\t"
+		::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
 		: "memory"
 	);
+        if(s->mb_intra)
+            block[0]= level;
 }
 
 
@@ -193,9 +156,10 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
 {
     int nCoeffs;
     const UINT16 *quant_matrix;
-    
-    if(s->alternate_scan) nCoeffs= 64;
-    else nCoeffs= zigzag_end[ s->block_last_index[n] ];
+
+    assert(s->block_last_index[n]>=0);
+
+    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
     if (s->mb_intra) {
         int block0;
@@ -312,6 +276,7 @@ asm volatile(
 		: "%eax", "memory"
 	);
     }
+
 }
 
 static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
@@ -320,8 +285,10 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
     int nCoeffs;
     const UINT16 *quant_matrix;
     
-    if(s->alternate_scan) nCoeffs= 64;
-    else nCoeffs= zigzag_end[ s->block_last_index[n] ];
+    assert(s->block_last_index[n]>=0);
+
+    if(s->alternate_scan) nCoeffs= 63; //FIXME
+    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
     if (s->mb_intra) {
         int block0;
@@ -371,7 +338,7 @@ asm volatile(
 		"movq %%mm5, 8(%0, %%eax)	\n\t"
 
 		"addl $16, %%eax		\n\t"
-		"js 1b				\n\t"
+		"jng 1b				\n\t"
 		::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
 		: "%eax", "memory"
 	);    
@@ -427,7 +394,7 @@ asm volatile(
 		"movq %%mm5, 8(%0, %%eax)	\n\t"
 
 		"addl $16, %%eax		\n\t"
-		"js 1b				\n\t"
+		"jng 1b				\n\t"
                 "movd 124(%0, %3), %%mm0	\n\t"
                 "movq %%mm7, %%mm6		\n\t"
                 "psrlq $32, %%mm7		\n\t"
@@ -534,12 +501,6 @@ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w)
     }
 }
 
-static volatile int esp_temp;
-
-void unused_var_warning_killer(){
-	esp_temp++;
-}
-
 #undef HAVE_MMX2
 #define RENAME(a) a ## _MMX
 #include "mpegvideo_mmx_template.c"
@@ -549,10 +510,40 @@ void unused_var_warning_killer(){
 #define RENAME(a) a ## _MMX2
 #include "mpegvideo_mmx_template.c"
 
+/* external functions, from idct_mmx.c */
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+/* XXX: these functions should be removed as soon as all IDCTs are
+   converted */
+static void ff_libmpeg2mmx_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_mmx_idct (block);
+    put_pixels_clamped(block, dest, line_size);
+}
+static void ff_libmpeg2mmx_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_mmx_idct (block);
+    add_pixels_clamped(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_mmxext_idct (block);
+    put_pixels_clamped(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    ff_mmxext_idct (block);
+    add_pixels_clamped(block, dest, line_size);
+}
+
 void MPV_common_init_mmx(MpegEncContext *s)
 {
     if (mm_flags & MM_MMX) {
-        const int dct_algo= s->avctx->dct_algo;
+        int i;
+        const int dct_algo = s->avctx->dct_algo;
+        const int idct_algo= s->avctx->idct_algo;
+        
         s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
         s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
         s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
@@ -568,5 +559,22 @@ void MPV_common_init_mmx(MpegEncContext *s)
                 s->dct_quantize= dct_quantize_MMX;
             }
         }
+        
+        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
+            s->idct_put= ff_simple_idct_put_mmx;
+            s->idct_add= ff_simple_idct_add_mmx;
+            for(i=0; i<64; i++)
+                s->idct_permutation[i]= simple_mmx_permutation[i];
+        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
+            if(mm_flags & MM_MMXEXT){
+                s->idct_put= ff_libmpeg2mmx2_idct_put;
+                s->idct_add= ff_libmpeg2mmx2_idct_add;
+            }else{
+                s->idct_put= ff_libmpeg2mmx_idct_put;
+                s->idct_add= ff_libmpeg2mmx_idct_add;
+            }
+            for(i=0; i<64; i++)
+                s->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
+        }
     }
 }
diff --git a/libavcodec/i386/mpegvideo_mmx_template.c b/libavcodec/i386/mpegvideo_mmx_template.c
index 1eed906c63..8fda458a21 100644
--- a/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/libavcodec/i386/mpegvideo_mmx_template.c
@@ -189,31 +189,143 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         );
     }
 
-    if(s->mb_intra) temp_block[0]= level; //FIXME move afer permute
-        
-// last_non_zero_p1=64;       
-    /* permute for IDCT */
-    asm volatile(
-        "movl %0, %%eax			\n\t"
-	"pushl %%ebp			\n\t"
-	"movl %%esp, " MANGLE(esp_temp) "\n\t"
-	"1:				\n\t"
-	"movzbl (%1, %%eax), %%ebx	\n\t"
-	"movzbl 1(%1, %%eax), %%ebp	\n\t"
-	"movw (%2, %%ebx, 2), %%cx	\n\t"
-	"movw (%2, %%ebp, 2), %%sp	\n\t"
-	"movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t"
-	"movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t"
-	"movw %%cx, (%3, %%ebx, 2)	\n\t"
-	"movw %%sp, (%3, %%ebp, 2)	\n\t"
-	"addl $2, %%eax			\n\t"
-	" js 1b				\n\t"
-	"movl " MANGLE(esp_temp) ", %%esp\n\t"
-	"popl %%ebp			\n\t"
-	: 
-	: "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block)
-	: "%eax", "%ebx", "%ecx"
-	);
+    if(s->mb_intra) block[0]= level;
+    else            block[0]= temp_block[0];
+
+    if(s->idct_permutation[1]==8){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; 
+        block[0x20] = temp_block[0x10]; 
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; 
+        block[0x09] = temp_block[0x03]; 
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; 
+        block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; 
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; 
+        block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; 
+        block[0x0C] = temp_block[0x05]; 
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; 
+        block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; 
+        block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; 
+        block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; 
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; 
+        block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; 
+        block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; 
+        block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; 
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; 
+        block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; 
+        block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; 
+        block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; 
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; 
+        block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; 
+        block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; 
+        block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; 
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; 
+        block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; 
+        block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; 
+        block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; 
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; 
+        block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; 
+        block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; 
+        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }else if(s->idct_permutation[1]==4){
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x04] = temp_block[0x01]; 
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; 
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; 
+        block[0x05] = temp_block[0x03]; 
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; 
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; 
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x1C] = temp_block[0x19]; 
+        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; 
+        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; 
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; 
+        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; 
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; 
+        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; 
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; 
+        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; 
+        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; 
+        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; 
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; 
+        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; 
+        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; 
+        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; 
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; 
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; 
+        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; 
+        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; 
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; 
+        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; 
+            block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; 
+        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; 
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; 
+        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; 
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; 
+        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }else{
+        if(last_non_zero_p1 <= 1) goto end;
+        block[0x01] = temp_block[0x01]; 
+        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; 
+        if(last_non_zero_p1 <= 4) goto end;
+        block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; 
+        block[0x03] = temp_block[0x03]; 
+        if(last_non_zero_p1 <= 7) goto end;
+        block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; 
+        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; 
+        if(last_non_zero_p1 <= 11) goto end;
+        block[0x19] = temp_block[0x19]; 
+        block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; 
+        block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; 
+        if(last_non_zero_p1 <= 16) goto end;
+        block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; 
+        block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; 
+        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; 
+        block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; 
+        if(last_non_zero_p1 <= 24) goto end;
+        block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; 
+        block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; 
+        block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; 
+        block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C]; 
+        if(last_non_zero_p1 <= 32) goto end;
+        block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; 
+        block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; 
+        block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; 
+        block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; 
+        if(last_non_zero_p1 <= 40) goto end;
+        block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; 
+        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; 
+        block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; 
+        block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; 
+        if(last_non_zero_p1 <= 48) goto end;
+        block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; 
+        block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; 
+        block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; 
+        block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; 
+        if(last_non_zero_p1 <= 56) goto end;
+        block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; 
+        block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; 
+        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; 
+        block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
+    }
+    end:
 /*
     for(i=0; i<last_non_zero_p1; i++)
     {
@@ -221,7 +333,6 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
        block[block_permute_op(j)]= temp_block[j];
     }
 */
-//block_permute(block);
 
     return last_non_zero_p1 - 1;
 }
diff --git a/libavcodec/i386/simple_idct_mmx.c b/libavcodec/i386/simple_idct_mmx.c
index 4f19cc20ae..0c859862ee 100644
--- a/libavcodec/i386/simple_idct_mmx.c
+++ b/libavcodec/i386/simple_idct_mmx.c
@@ -1291,7 +1291,20 @@ Temp
 	);
 }
 
-void simple_idct_mmx(int16_t *block)
+void ff_simple_idct_mmx(int16_t *block)
 {
-	idct(block);
+    idct(block);
+}
+
+// FIXME: merge the add/put step into the IDCT itself
+
+void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    idct(block);
+    put_pixels_clamped(block, dest, line_size);
+}
+void ff_simple_idct_add_mmx(UINT8 *dest, int line_size, DCTELEM *block)
+{
+    idct(block);
+    add_pixels_clamped(block, dest, line_size);
 }
author	Michael Niedermayer <michaelni@gmx.at>	2002-09-29 22:44:22 +0000
committer	Michael Niedermayer <michaelni@gmx.at>	2002-09-29 22:44:22 +0000
commit	2ad1516a6c7180d4f9343c0f07120eaec5130d6e (patch)
tree	38dfb52da33739e269f30177e8b46c86067dbc67 /libavcodec/i386
parent	f9bb4bdffcbde7362db2a0e041a2893dde0ace6f (diff)