Utility functions (CRC calc & float->int converters)

[imported from MPlayer, based on a52dec's libao] Originally committed as revision 1780 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Arpi <arpi@thot.banki.hu> 2003-04-16 20:03:07 +0000
committer: Arpi <arpi@thot.banki.hu> 2003-04-16 20:03:07 +0000
commit: 1a7c3c85622fa0ff48127cfe38ece15e27bfb17c (patch)
tree: d2129e00537036d9851375e045518d4e2b1d4597 /libavcodec
parent: 6814a25c676ae3f0eb73a2d7180b7fe9e62a62ec (diff)
7 files changed, 858 insertions, 0 deletions
diff --git a/libavcodec/liba52/a52_internal.h b/libavcodec/liba52/a52_internal.h
index a158227699..5235704ffe 100644
--- a/libavcodec/liba52/a52_internal.h
+++ b/libavcodec/liba52/a52_internal.h
@@ -118,6 +118,8 @@ void a52_upmix (sample_t * samples, int acmod, int output);
 void a52_imdct_init (uint32_t mm_accel);
 void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
 void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
+//extern void (* a52_imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
+//extern void (* a52_imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
 
 #define ROUND(x) ((int)((x) + ((x) > 0 ? 0.5 : -0.5)))
 
diff --git a/libavcodec/liba52/a52_util.h b/libavcodec/liba52/a52_util.h
new file mode 100644
index 0000000000..121393ec19
--- /dev/null
+++ b/libavcodec/liba52/a52_util.h
@@ -0,0 +1,32 @@
+/*
+ * a52_util.h
+ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of a52dec, a free ATSC A-52 stream decoder.
+ * See http://liba52.sourceforge.net/ for updates.
+ *
+ * a52dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * a52dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef A52_UTIL_H
+#define A52_UTIL_H
+
+/* Compute the 16-bit CRC of num_bytes bytes starting at data (initial state 0). */
+uint16_t a52_crc16_block(uint8_t *data,uint32_t num_bytes);
+
+/*
+ * Select the float->int16 converter for the given channel flags and output
+ * channel count, store it in a52_resample and return it.
+ * Returns NULL (leaving a52_resample untouched) when no converter exists
+ * for that combination.
+ */
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
+/*
+ * Converter chosen by a52_resample_init(); NULL until an init succeeds.
+ * Reads 256 samples per channel from _f, writes interleaved int16 samples
+ * to s16 and returns the number of int16 values written.
+ */
+extern int (* a52_resample) (float * _f, int16_t * s16);
+
+#endif /* A52_UTIL_H */
diff --git a/libavcodec/liba52/crc.c b/libavcodec/liba52/crc.c
new file mode 100644
index 0000000000..6698155bd4
--- /dev/null
+++ b/libavcodec/liba52/crc.c
@@ -0,0 +1,73 @@
+/* 
+ *    crc.c
+ *
+ *	Copyright (C) Aaron Holtzman - May 1999
+ *
+ *  This file is part of ac3dec, a free Dolby AC-3 stream decoder.
+ *	
+ *  ac3dec is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *   
+ *  ac3dec is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *   
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+/*
+ * Byte-wise lookup table for an MSB-first CRC-16 with generator polynomial
+ * 0x8005 (x^16 + x^15 + x^2 + 1).  Entry n is the CRC state obtained by
+ * feeding the single byte n into a zero state.
+ */
+static const uint16_t crc_lut[256] = 
+{
+	0x0000,0x8005,0x800f,0x000a,0x801b,0x001e,0x0014,0x8011,
+	0x8033,0x0036,0x003c,0x8039,0x0028,0x802d,0x8027,0x0022,
+	0x8063,0x0066,0x006c,0x8069,0x0078,0x807d,0x8077,0x0072,
+	0x0050,0x8055,0x805f,0x005a,0x804b,0x004e,0x0044,0x8041,
+	0x80c3,0x00c6,0x00cc,0x80c9,0x00d8,0x80dd,0x80d7,0x00d2,
+	0x00f0,0x80f5,0x80ff,0x00fa,0x80eb,0x00ee,0x00e4,0x80e1,
+	0x00a0,0x80a5,0x80af,0x00aa,0x80bb,0x00be,0x00b4,0x80b1,
+	0x8093,0x0096,0x009c,0x8099,0x0088,0x808d,0x8087,0x0082,
+	0x8183,0x0186,0x018c,0x8189,0x0198,0x819d,0x8197,0x0192,
+	0x01b0,0x81b5,0x81bf,0x01ba,0x81ab,0x01ae,0x01a4,0x81a1,
+	0x01e0,0x81e5,0x81ef,0x01ea,0x81fb,0x01fe,0x01f4,0x81f1,
+	0x81d3,0x01d6,0x01dc,0x81d9,0x01c8,0x81cd,0x81c7,0x01c2,
+	0x0140,0x8145,0x814f,0x014a,0x815b,0x015e,0x0154,0x8151,
+	0x8173,0x0176,0x017c,0x8179,0x0168,0x816d,0x8167,0x0162,
+	0x8123,0x0126,0x012c,0x8129,0x0138,0x813d,0x8137,0x0132,
+	0x0110,0x8115,0x811f,0x011a,0x810b,0x010e,0x0104,0x8101,
+	0x8303,0x0306,0x030c,0x8309,0x0318,0x831d,0x8317,0x0312,
+	0x0330,0x8335,0x833f,0x033a,0x832b,0x032e,0x0324,0x8321,
+	0x0360,0x8365,0x836f,0x036a,0x837b,0x037e,0x0374,0x8371,
+	0x8353,0x0356,0x035c,0x8359,0x0348,0x834d,0x8347,0x0342,
+	0x03c0,0x83c5,0x83cf,0x03ca,0x83db,0x03de,0x03d4,0x83d1,
+	0x83f3,0x03f6,0x03fc,0x83f9,0x03e8,0x83ed,0x83e7,0x03e2,
+	0x83a3,0x03a6,0x03ac,0x83a9,0x03b8,0x83bd,0x83b7,0x03b2,
+	0x0390,0x8395,0x839f,0x039a,0x838b,0x038e,0x0384,0x8381,
+	0x0280,0x8285,0x828f,0x028a,0x829b,0x029e,0x0294,0x8291,
+	0x82b3,0x02b6,0x02bc,0x82b9,0x02a8,0x82ad,0x82a7,0x02a2,
+	0x82e3,0x02e6,0x02ec,0x82e9,0x02f8,0x82fd,0x82f7,0x02f2,
+	0x02d0,0x82d5,0x82df,0x02da,0x82cb,0x02ce,0x02c4,0x82c1,
+	0x8243,0x0246,0x024c,0x8249,0x0258,0x825d,0x8257,0x0252,
+	0x0270,0x8275,0x827f,0x027a,0x826b,0x026e,0x0264,0x8261,
+	0x0220,0x8225,0x822f,0x022a,0x823b,0x023e,0x0234,0x8231,
+	0x8213,0x0216,0x021c,0x8219,0x0208,0x820d,0x8207,0x0202
+};
+
+/*
+ * Run the table-driven CRC-16 over a buffer.
+ *
+ * data      - bytes to checksum
+ * num_bytes - number of bytes to read from data
+ *
+ * The state starts at 0 and is folded one byte at a time: the top byte of
+ * the state is xor-ed with the input byte to index crc_lut, and the low
+ * byte is shifted up.  Returns the final 16-bit state.
+ */
+uint16_t a52_crc16_block(uint8_t *data,uint32_t num_bytes)
+{
+	const uint8_t *cursor = data;
+	uint32_t remaining = num_bytes;
+	uint16_t crc = 0;
+
+	while (remaining-- != 0) {
+		const unsigned idx = (unsigned)(*cursor++ ^ (crc >> 8));
+		crc = (uint16_t)(crc_lut[idx] ^ (crc << 8));
+	}
+	return crc;
+}
diff --git a/libavcodec/liba52/mm_accel.h b/libavcodec/liba52/mm_accel.h
index 25258c3683..8afbd354cd 100644
--- a/libavcodec/liba52/mm_accel.h
+++ b/libavcodec/liba52/mm_accel.h
@@ -31,6 +31,11 @@
 #define MM_ACCEL_X86_MMX	0x80000000
 #define MM_ACCEL_X86_3DNOW	0x40000000
 #define MM_ACCEL_X86_MMXEXT	0x20000000
+#define MM_ACCEL_X86_SSE	0x10000000
+#define MM_ACCEL_X86_3DNOWEXT	0x08000000
+
+/* PPC accelerations */
+#define MM_ACCEL_PPC_ALTIVEC	0x00010000
 
 uint32_t mm_accel (void);
 
diff --git a/libavcodec/liba52/resample.c b/libavcodec/liba52/resample.c
new file mode 100644
index 0000000000..284cbbe78d
--- /dev/null
+++ b/libavcodec/liba52/resample.c
@@ -0,0 +1,45 @@
+
+// a52_resample_init should find the requested converter (from type flags ->
+// given number of channels) and set up some function pointers...
+
+// a52_resample() should do the conversion.
+
+#include <inttypes.h>
+#include <stdio.h>
+#include "a52.h"
+#include "mm_accel.h"
+#include "config.h"
+#include "../libpostproc/mangle.h"
+
+/* Active converter; stays NULL until a52_resample_init() finds a match. */
+int (* a52_resample) (float * _f, int16_t * s16)=NULL;
+
+#include "resample_c.c"
+
+#ifdef ARCH_X86
+#include "resample_mmx.c"
+#endif
+
+/*
+ * Pick a float->int16 converter and install it in a52_resample.
+ *
+ * mm_accel - MM_ACCEL_* capability bits; only MM_ACCEL_X86_MMX is examined,
+ *            and only when built with ARCH_X86
+ * flags    - channel configuration, passed through to the selectors
+ * chans    - number of interleaved output channels
+ *
+ * The MMX selector is tried first (when permitted), then the plain C one.
+ * A one-line notice is printed to stderr only if no converter was installed
+ * before this call.  Returns the converter, or NULL (after printing an
+ * error) when neither selector knows the combination; a52_resample is left
+ * unchanged in that case.
+ */
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans){
+    void* converter = NULL;
+    const char* notice = NULL;
+
+#ifdef ARCH_X86
+    if(mm_accel&MM_ACCEL_X86_MMX){
+	converter = a52_resample_MMX(flags,chans);
+	notice = "Using MMX optimized resampler\n";
+    }
+#endif
+
+    if(!converter){
+	converter = a52_resample_C(flags,chans);
+	notice = "No accelerated resampler found\n";
+    }
+
+    if(!converter){
+	fprintf(stderr, "Unimplemented resampler for mode 0x%X -> %d channels conversion - Contact MPlayer developers!\n", flags, chans);
+	return NULL;
+    }
+
+    /* announce the choice on the first successful init only */
+    if(a52_resample==NULL) fputs(notice, stderr);
+    a52_resample = converter;
+    return converter;
+}
diff --git a/libavcodec/liba52/resample_c.c b/libavcodec/liba52/resample_c.c
new file mode 100644
index 0000000000..a618ec6e9e
--- /dev/null
+++ b/libavcodec/liba52/resample_c.c
@@ -0,0 +1,183 @@
+// this code is based on a52dec/libao/audio_out_oss.c
+
+/*
+ * Map the raw bit pattern of a biased float sample to a signed 16-bit value.
+ *
+ * 0x43c00000 is the IEEE-754 single-precision encoding of 384.0f; patterns
+ * within [-32768, +32767] of it are returned as that offset, anything
+ * outside the window is clamped to the int16 limits.  The range is tested
+ * before subtracting so the subtraction itself can never overflow.
+ */
+static inline int16_t convert (int32_t i)
+{
+    enum { BIAS_BITS = 0x43c00000, LOWEST = 0x43bf8000, HIGHEST = 0x43c07fff };
+
+    if (i < LOWEST)
+	return -32768;
+    return (i > HIGHEST) ? 32767 : (int16_t) (i - BIAS_BITS);
+}
+
+/*
+ * One input plane -> 5 interleaved int16 slots per frame.
+ * Slots 0-3 are zeroed, slot 4 receives the converted sample.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_MONO_to_5_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 5) {
+	frame[0] = 0;
+	frame[1] = 0;
+	frame[2] = 0;
+	frame[3] = 0;
+	frame[4] = convert (bits[n]);
+    }
+    return 5*256;
+}
+
+/*
+ * One input plane -> one int16 per frame.
+ * Returns the number of int16 values written (256).
+ */
+static int a52_resample_MONO_to_1_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * out = s16;
+    int n;
+
+    for (n = 0; n < 256; n++)
+	*out++ = convert (bits[n]);
+    return 1*256;
+}
+
+/*
+ * Two input planes of 256 samples -> 2 interleaved int16 slots per frame.
+ * Plane 0 goes to slot 0, plane 1 to slot 1.
+ * Returns the number of int16 values written (2*256).
+ */
+static int a52_resample_STEREO_to_2_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 2) {
+	frame[0] = convert (bits[n]);
+	frame[1] = convert (bits[n + 1*256]);
+    }
+    return 2*256;
+}
+
+/*
+ * Three input planes of 256 samples -> 5 interleaved int16 slots per frame.
+ * Plane 0 -> slot 0, plane 2 -> slot 1, plane 1 -> slot 4; slots 2 and 3
+ * are zeroed.  Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_to_5_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 5) {
+	frame[0] = convert (bits[n]);
+	frame[1] = convert (bits[n + 2*256]);
+	frame[2] = 0;
+	frame[3] = 0;
+	frame[4] = convert (bits[n + 1*256]);
+    }
+    return 5*256;
+}
+
+/*
+ * Four input planes of 256 samples -> 4 interleaved int16 slots per frame,
+ * plane k going to slot k.
+ * Returns the number of int16 values written (4*256).
+ */
+static int a52_resample_2F_2R_to_4_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 4) {
+	frame[0] = convert (bits[n]);
+	frame[1] = convert (bits[n + 1*256]);
+	frame[2] = convert (bits[n + 2*256]);
+	frame[3] = convert (bits[n + 3*256]);
+    }
+    return 4*256;
+}
+
+/*
+ * Five input planes of 256 samples -> 5 interleaved int16 slots per frame.
+ * Plane order per frame is 0, 2, 3, 4, 1.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_2R_to_5_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 5) {
+	frame[0] = convert (bits[n]);
+	frame[1] = convert (bits[n + 2*256]);
+	frame[2] = convert (bits[n + 3*256]);
+	frame[3] = convert (bits[n + 4*256]);
+	frame[4] = convert (bits[n + 1*256]);
+    }
+    return 5*256;
+}
+
+/*
+ * Two input planes of 256 samples -> 6 interleaved int16 slots per frame.
+ * Slots 0-3 are zeroed, plane 1 -> slot 4, plane 0 -> slot 5.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_MONO_LFE_to_6_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 6) {
+	frame[0] = 0;
+	frame[1] = 0;
+	frame[2] = 0;
+	frame[3] = 0;
+	frame[4] = convert (bits[n + 1*256]);
+	frame[5] = convert (bits[n]);
+    }
+    return 6*256;
+}
+
+/*
+ * Three input planes of 256 samples -> 6 interleaved int16 slots per frame.
+ * Plane 1 -> slot 0, plane 2 -> slot 1, plane 0 -> slot 5; slots 2-4 are
+ * zeroed.  Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_STEREO_LFE_to_6_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 6) {
+	frame[0] = convert (bits[n + 1*256]);
+	frame[1] = convert (bits[n + 2*256]);
+	frame[2] = 0;
+	frame[3] = 0;
+	frame[4] = 0;
+	frame[5] = convert (bits[n]);
+    }
+    return 6*256;
+}
+
+/*
+ * Four input planes of 256 samples -> 6 interleaved int16 slots per frame.
+ * Plane 1 -> slot 0, plane 3 -> slot 1, plane 2 -> slot 4, plane 0 -> slot 5;
+ * slots 2 and 3 are zeroed.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_LFE_to_6_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 6) {
+	frame[0] = convert (bits[n + 1*256]);
+	frame[1] = convert (bits[n + 3*256]);
+	frame[2] = 0;
+	frame[3] = 0;
+	frame[4] = convert (bits[n + 2*256]);
+	frame[5] = convert (bits[n]);
+    }
+    return 6*256;
+}
+
+/*
+ * Five input planes of 256 samples -> 6 interleaved int16 slots per frame.
+ * Planes 1-4 fill slots 0-3, slot 4 is zeroed, plane 0 -> slot 5.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_2F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 6) {
+	frame[0] = convert (bits[n + 1*256]);
+	frame[1] = convert (bits[n + 2*256]);
+	frame[2] = convert (bits[n + 3*256]);
+	frame[3] = convert (bits[n + 4*256]);
+	frame[4] = 0;
+	frame[5] = convert (bits[n]);
+    }
+    return 6*256;
+}
+
+/*
+ * Six input planes of 256 samples -> 6 interleaved int16 slots per frame.
+ * Plane order per frame is 1, 3, 4, 5, 2, 0.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+    const int32_t * bits = (const int32_t *) _f;
+    int16_t * frame = s16;
+    int n;
+
+    for (n = 0; n < 256; n++, frame += 6) {
+	frame[0] = convert (bits[n + 1*256]);
+	frame[1] = convert (bits[n + 3*256]);
+	frame[2] = convert (bits[n + 4*256]);
+	frame[3] = convert (bits[n + 5*256]);
+	frame[4] = convert (bits[n + 2*256]);
+	frame[5] = convert (bits[n]);
+    }
+    return 6*256;
+}
+
+
+/*
+ * Look up the plain C converter for a channel configuration.
+ *
+ * flags - channel configuration (A52_* value, optionally or-ed with A52_LFE)
+ * ch    - number of interleaved output channels requested
+ *
+ * Returns the matching converter, or NULL when the (flags, ch) pair is not
+ * listed in the table below.
+ */
+static void* a52_resample_C(int flags, int ch){
+    typedef int (*converter_fn) (float *, int16_t *);
+    static const struct {
+	int mode;
+	int chans;
+	converter_fn fn;
+    } known[] = {
+	{ A52_MONO,              5, a52_resample_MONO_to_5_C },
+	{ A52_MONO,              1, a52_resample_MONO_to_1_C },
+	{ A52_CHANNEL,           2, a52_resample_STEREO_to_2_C },
+	{ A52_STEREO,            2, a52_resample_STEREO_to_2_C },
+	{ A52_DOLBY,             2, a52_resample_STEREO_to_2_C },
+	{ A52_3F,                5, a52_resample_3F_to_5_C },
+	{ A52_2F2R,              4, a52_resample_2F_2R_to_4_C },
+	{ A52_3F2R,              5, a52_resample_3F_2R_to_5_C },
+	{ A52_MONO | A52_LFE,    6, a52_resample_MONO_LFE_to_6_C },
+	{ A52_CHANNEL | A52_LFE, 6, a52_resample_STEREO_LFE_to_6_C },
+	{ A52_STEREO | A52_LFE,  6, a52_resample_STEREO_LFE_to_6_C },
+	{ A52_DOLBY | A52_LFE,   6, a52_resample_STEREO_LFE_to_6_C },
+	{ A52_3F | A52_LFE,      6, a52_resample_3F_LFE_to_6_C },
+	{ A52_2F2R | A52_LFE,    6, a52_resample_2F_2R_LFE_to_6_C },
+	{ A52_3F2R | A52_LFE,    6, a52_resample_3F_2R_LFE_to_6_C },
+    };
+    unsigned k;
+
+    for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+	if (known[k].mode == flags && known[k].chans == ch)
+	    return known[k].fn;
+    }
+    return NULL;
+}
diff --git a/libavcodec/liba52/resample_mmx.c b/libavcodec/liba52/resample_mmx.c
new file mode 100644
index 0000000000..a4079798f7
--- /dev/null
+++ b/libavcodec/liba52/resample_mmx.c
@@ -0,0 +1,518 @@
+
+// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
+
+/* optimization TODO / NOTES
+    movntq is slightly faster (0.5% with the current test.c benchmark)
+	(but that's just test.c, so it needs to be tested in reality)
+	and it would mean (C / MMX2 / MMX / 3DNOW) versions
+*/
+
+/*
+ * Constants loaded by name from the inline asm below.
+ * magicF2W holds 0x43c00000 in both dwords (the value the asm subtracts from
+ * each sample before packing); the wm* values are 16-bit lane masks.
+ */
+static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
+static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
+static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
+static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
+
+/*
+ * MMX version of a52_resample_MONO_to_5_C: one input plane of 256 samples
+ * -> 5 int16 slots per frame, slots 0-3 zero and slot 4 the sample.
+ *
+ * esi runs from -512 to 0 in steps of 8 and is scaled by 2 for the input
+ * (f+256) and by 5 (edi) for the output (s16+1280), so both base pointers
+ * mark the end of their buffers and each pass handles 4 samples / 40 output
+ * bytes.  psubd removes magicF2W and packssdw saturates to int16.
+ * Returns the number of int16 values written (5*256).
+ *
+ * NOTE(review): the clobber list names esi, edi and memory but none of the
+ * mm registers the code overwrites - confirm this is acceptable for the
+ * compilers in use.
+ */
+static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-512, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"movq "MANGLE(wm1100)", %%mm3	\n\t"
+		"movq "MANGLE(wm0101)", %%mm4	\n\t"
+		"movq "MANGLE(wm1010)", %%mm5	\n\t"
+		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq (%1, %%esi, 2), %%mm0	\n\t"
+		"movq 8(%1, %%esi, 2), %%mm1	\n\t"
+		"leal (%%esi, %%esi, 4), %%edi	\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"pand %%mm4, %%mm0		\n\t"
+		"pand %%mm5, %%mm1		\n\t"
+		"movq %%mm6, (%0, %%edi)	\n\t" // 0 0 0 0
+		"movd %%mm0, 8(%0, %%edi)	\n\t" // A 0
+		"pand %%mm3, %%mm0		\n\t"
+		"movd %%mm6, 12(%0, %%edi)	\n\t" // 0 0
+		"movd %%mm1, 16(%0, %%edi)	\n\t" // 0 B
+		"pand %%mm3, %%mm1		\n\t"
+		"movd %%mm6, 20(%0, %%edi)	\n\t" // 0 0
+		"movq %%mm0, 24(%0, %%edi)	\n\t" // 0 0 C 0
+		"movq %%mm1, 32(%0, %%edi)	\n\t" // 0 0 0 D
+		"addl $8, %%esi			\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1280), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 5*256;
+}
+
+/*
+ * MMX version of a52_resample_STEREO_to_2_C: two input planes of 256
+ * samples -> 2 interleaved int16 slots per frame.
+ *
+ * esi runs from -1024 to 0 in steps of 16 and is used unscaled for both
+ * buffers; f+256 and s16+512 mark the end of plane 0 and of the output, and
+ * displacement 1024 selects plane 1.  Each pass converts 4 samples per plane
+ * (psubd magicF2W, packssdw with saturation) and interleaves them with
+ * punpcklwd/punpckhwd.  Returns the number of int16 values written (2*256).
+ */
+static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
+#ifdef HAVE_SSE
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"1:				\n\t"
+		"cvtps2pi (%1, %%esi), %%mm0	\n\t"
+		"cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"punpcklwd %%mm2, %%mm0		\n\t"
+		"punpckhwd %%mm2, %%mm1		\n\t"
+		"movq %%mm0, (%0, %%esi)	\n\t"
+		"movq %%mm1, 8(%0, %%esi)	\n\t"
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+512), "r" (f+256)
+		:"%esi", "memory"
+	);*/
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"1:				\n\t"
+		"movq (%1, %%esi), %%mm0	\n\t"
+		"movq 8(%1, %%esi), %%mm1	\n\t"
+		"movq 1024(%1, %%esi), %%mm2	\n\t"
+		"movq 1032(%1, %%esi), %%mm3	\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"punpcklwd %%mm2, %%mm0		\n\t"
+		"punpckhwd %%mm2, %%mm1		\n\t"
+		"movq %%mm0, (%0, %%esi)	\n\t"
+		"movq %%mm1, 8(%0, %%esi)	\n\t"
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+512), "r" (f+256)
+		:"%esi", "memory"
+	);
+    return 2*256;
+}
+
+/*
+ * MMX version of a52_resample_3F_to_5_C: three input planes of 256 samples
+ * -> 5 int16 slots per frame (plane 0, plane 2, 0, 0, plane 1).
+ *
+ * esi runs from -1024 to 0 in steps of 16 (4 samples per plane per pass);
+ * edi = esi*5/2 is the matching output offset (40 bytes per pass).  f+256
+ * and s16+1280 mark the end of plane 0 and of the output; displacements
+ * 1024 and 2048 select planes 1 and 2.
+ *
+ * Zero slots are produced without extra masking: mm5 is magicF2W with its
+ * high dword cleared, and where a slot must be 0 the bias constant itself
+ * (mm7) is placed in the lane so that the psubd yields 0.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"pxor %%mm6, %%mm6		\n\t"
+		"movq %%mm7, %%mm5		\n\t"
+		"punpckldq %%mm6, %%mm5		\n\t"
+		"1:				\n\t"
+		"movd (%1, %%esi), %%mm0	\n\t"
+		"punpckldq 2048(%1, %%esi), %%mm0\n\t"
+		"movd 1024(%1, %%esi), %%mm1	\n\t"
+		"punpckldq 4(%1, %%esi), %%mm1	\n\t"
+		"movd 2052(%1, %%esi), %%mm2	\n\t"
+		"movq %%mm7, %%mm3		\n\t"
+		"punpckldq 1028(%1, %%esi), %%mm3\n\t"
+		"movd 8(%1, %%esi), %%mm4	\n\t"
+		"punpckldq 2056(%1, %%esi), %%mm4\n\t"
+		"leal (%%esi, %%esi, 4), %%edi	\n\t"
+		"sarl $1, %%edi			\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm5, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"packssdw %%mm6, %%mm0		\n\t"
+		"packssdw %%mm2, %%mm1		\n\t"
+		"packssdw %%mm4, %%mm3		\n\t"
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm1, 8(%0, %%edi)	\n\t"
+		"movq %%mm3, 16(%0, %%edi)	\n\t"
+		
+		"movd 1032(%1, %%esi), %%mm1	\n\t"
+		"punpckldq 12(%1, %%esi), %%mm1\n\t"
+		"movd 2060(%1, %%esi), %%mm2	\n\t"
+		"movq %%mm7, %%mm3		\n\t"
+		"punpckldq 1036(%1, %%esi), %%mm3\n\t"
+		"pxor %%mm0, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm5, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"movq %%mm0, 24(%0, %%edi)	\n\t"
+		"movq %%mm2, 32(%0, %%edi)	\n\t"
+				
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1280), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 5*256;
+}
+
+/*
+ * MMX version of a52_resample_2F_2R_to_4_C: four input planes of 256
+ * samples -> 4 interleaved int16 slots per frame, plane k in slot k.
+ *
+ * esi runs from -1024 to 0 in steps of 16 (4 samples per plane per pass)
+ * and is scaled by 2 for the output (32 bytes per pass).  f+256 and
+ * s16+1024 mark the end of plane 0 and of the output; displacements 1024,
+ * 2048 and 3072 select planes 1-3.  After bias removal and saturating pack,
+ * the word and dword unpacks transpose 4 planes x 4 samples into 4 frames.
+ * Returns the number of int16 values written (4*256).
+ */
+static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"1:				\n\t"
+		"movq (%1, %%esi), %%mm0	\n\t"
+		"movq 8(%1, %%esi), %%mm1	\n\t"
+		"movq 1024(%1, %%esi), %%mm2	\n\t"
+		"movq 1032(%1, %%esi), %%mm3	\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"movq 2048(%1, %%esi), %%mm3	\n\t"
+		"movq 2056(%1, %%esi), %%mm4	\n\t"
+		"movq 3072(%1, %%esi), %%mm5	\n\t"
+		"movq 3080(%1, %%esi), %%mm6	\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"psubd %%mm7, %%mm6		\n\t"
+		"packssdw %%mm4, %%mm3		\n\t"
+		"packssdw %%mm6, %%mm5		\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"movq %%mm3, %%mm4		\n\t"
+		"punpcklwd %%mm2, %%mm0		\n\t"
+		"punpckhwd %%mm2, %%mm1		\n\t"
+		"punpcklwd %%mm5, %%mm3		\n\t"
+		"punpckhwd %%mm5, %%mm4		\n\t"
+		"movq %%mm0, %%mm2		\n\t"
+		"movq %%mm1, %%mm5		\n\t"
+		"punpckldq %%mm3, %%mm0		\n\t"
+		"punpckhdq %%mm3, %%mm2		\n\t"
+		"punpckldq %%mm4, %%mm1		\n\t"
+		"punpckhdq %%mm4, %%mm5		\n\t"
+		"movq %%mm0, (%0, %%esi,2)	\n\t"
+		"movq %%mm2, 8(%0, %%esi,2)	\n\t"
+		"movq %%mm1, 16(%0, %%esi,2)	\n\t"
+		"movq %%mm5, 24(%0, %%esi,2)	\n\t"
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1024), "r" (f+256)
+		:"%esi", "memory"
+	);
+    return 4*256;
+}
+
+/*
+ * MMX version of a52_resample_3F_2R_to_5_C: five input planes of 256
+ * samples -> 5 int16 slots per frame in plane order 0, 2, 3, 4, 1.
+ *
+ * esi runs from -1024 to 0 in steps of 16 (4 samples per plane per pass);
+ * edi = esi*5/2 is the matching output offset (40 bytes per pass).  f+256
+ * and s16+1280 mark the end of plane 0 and of the output; displacements of
+ * 1024*k select plane k.  Samples are gathered pairwise with
+ * movd/punpckldq already in output order, so after psubd/packssdw the
+ * registers can be stored directly.
+ * Returns the number of int16 values written (5*256).
+ */
+static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"1:				\n\t"
+		"movd (%1, %%esi), %%mm0	\n\t"
+		"punpckldq 2048(%1, %%esi), %%mm0\n\t"
+		"movd 3072(%1, %%esi), %%mm1	\n\t"
+		"punpckldq 4096(%1, %%esi), %%mm1\n\t"
+		"movd 1024(%1, %%esi), %%mm2	\n\t"
+		"punpckldq 4(%1, %%esi), %%mm2	\n\t"
+		"movd 2052(%1, %%esi), %%mm3	\n\t"
+		"punpckldq 3076(%1, %%esi), %%mm3\n\t"
+		"movd 4100(%1, %%esi), %%mm4	\n\t"
+		"punpckldq 1028(%1, %%esi), %%mm4\n\t"
+		"movd 8(%1, %%esi), %%mm5	\n\t"
+		"punpckldq 2056(%1, %%esi), %%mm5\n\t"
+		"leal (%%esi, %%esi, 4), %%edi	\n\t"
+		"sarl $1, %%edi			\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"packssdw %%mm5, %%mm4		\n\t"
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm2, 8(%0, %%edi)	\n\t"
+		"movq %%mm4, 16(%0, %%edi)	\n\t"
+		
+		"movd 3080(%1, %%esi), %%mm0	\n\t"
+		"punpckldq 4104(%1, %%esi), %%mm0\n\t"
+		"movd 1032(%1, %%esi), %%mm1	\n\t"
+		"punpckldq 12(%1, %%esi), %%mm1\n\t"
+		"movd 2060(%1, %%esi), %%mm2	\n\t"
+		"punpckldq 3084(%1, %%esi), %%mm2\n\t"
+		"movd 4108(%1, %%esi), %%mm3	\n\t"
+		"punpckldq 1036(%1, %%esi), %%mm3\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"movq %%mm0, 24(%0, %%edi)	\n\t"
+		"movq %%mm2, 32(%0, %%edi)	\n\t"
+				
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1280), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 5*256;
+}
+
+/*
+ * MMX version of a52_resample_MONO_LFE_to_6_C: two input planes of 256
+ * samples -> 6 int16 slots per frame (0, 0, 0, 0, plane 1, plane 0).
+ *
+ * esi runs from -1024 to 0 in steps of 16 (4 samples per plane per pass);
+ * edi = esi*3 is the matching output offset (48 bytes per pass).  f+256 and
+ * s16+1536 mark the end of plane 0 and of the output.  Each frame is written
+ * as an 8-byte zero store (mm6) followed by a 4-byte store of the
+ * (plane 1, plane 0) word pair.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq 1024(%1, %%esi), %%mm0	\n\t"
+		"movq 1032(%1, %%esi), %%mm1	\n\t"
+		"movq (%1, %%esi), %%mm2	\n\t"
+		"movq 8(%1, %%esi), %%mm3	\n\t"
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"packssdw %%mm1, %%mm0		\n\t"
+		"packssdw %%mm3, %%mm2		\n\t"
+		"movq %%mm0, %%mm1		\n\t"
+		"punpcklwd %%mm2, %%mm0		\n\t"
+		"punpckhwd %%mm2, %%mm1		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t"
+		"movq %%mm6, (%0, %%edi)	\n\t"
+		"movd %%mm0, 8(%0, %%edi)	\n\t"
+		"punpckhdq %%mm0, %%mm0		\n\t"
+		"movq %%mm6, 12(%0, %%edi)	\n\t"
+		"movd %%mm0, 20(%0, %%edi)	\n\t"
+		"movq %%mm6, 24(%0, %%edi)	\n\t"
+		"movd %%mm1, 32(%0, %%edi)	\n\t"
+		"punpckhdq %%mm1, %%mm1		\n\t"
+		"movq %%mm6, 36(%0, %%edi)	\n\t"
+		"movd %%mm1, 44(%0, %%edi)	\n\t"
+		"addl $16, %%esi		\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1536), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 6*256;
+}
+
+/*
+ * MMX version of a52_resample_STEREO_LFE_to_6_C: three input planes of 256
+ * samples -> 6 int16 slots per frame (plane 1, plane 2, 0, 0, 0, plane 0).
+ *
+ * esi runs from -1024 to 0 in steps of 8 (2 samples per plane per pass);
+ * edi = esi*3 is the matching output offset (24 bytes per pass).  f+256 and
+ * s16+1536 mark the end of plane 0 and of the output.
+ *
+ * The lane comments list words from most to least significant; lower-case
+ * letters are the first sample of the pass, upper-case the second.  mm3 is
+ * read before being written: its low dword ("XX") is a don't-care that the
+ * following punpckhdq discards.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq 1024(%1, %%esi), %%mm0	\n\t"
+		"movq 2048(%1, %%esi), %%mm1	\n\t"
+		"movq (%1, %%esi), %%mm5	\n\t" 
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t"
+		
+		"pxor %%mm4, %%mm4		\n\t"
+		"packssdw %%mm5, %%mm0		\n\t" // FfAa
+		"packssdw %%mm4, %%mm1		\n\t" // 00Bb
+		"punpckhwd %%mm0, %%mm4		\n\t" // F0f0
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"punpckldq %%mm4, %%mm3		\n\t" // f0XX
+		"punpckldq %%mm6, %%mm0		\n\t" // 00ba
+		"punpckhdq %%mm1, %%mm3		\n\t" // BAf0
+		
+		"movq %%mm0, (%0, %%edi)	\n\t" // 00ba
+		"punpckhdq %%mm4, %%mm0		\n\t" // F000
+		"movq %%mm3, 8(%0, %%edi)	\n\t" // BAf0
+		"movq %%mm0, 16(%0, %%edi)	\n\t" // F000
+		"addl $8, %%esi			\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1536), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 6*256;
+}
+
+/*
+ * MMX version of a52_resample_3F_LFE_to_6_C: four input planes of 256
+ * samples -> 6 int16 slots per frame
+ * (plane 1, plane 3, 0, 0, plane 2, plane 0).
+ *
+ * esi runs from -1024 to 0 in steps of 8 (2 samples per plane per pass);
+ * edi = esi*3 is the matching output offset (24 bytes per pass).  f+256 and
+ * s16+1536 mark the end of plane 0 and of the output.  The lane comments
+ * list words from most to least significant; lower-case letters are the
+ * first sample of the pass, upper-case the second.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq 1024(%1, %%esi), %%mm0	\n\t"
+		"movq 3072(%1, %%esi), %%mm1	\n\t"
+		"movq 2048(%1, %%esi), %%mm4	\n\t"
+		"movq (%1, %%esi), %%mm5	\n\t" 
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t"
+		
+		"packssdw %%mm4, %%mm0		\n\t" // EeAa
+		"packssdw %%mm5, %%mm1		\n\t" // FfBb
+		"movq %%mm0, %%mm2		\n\t" // EeAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // FEfe
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"punpckldq %%mm6, %%mm0		\n\t" // 00ba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"punpckhdq %%mm2, %%mm0		\n\t" // FE00
+		"punpckldq %%mm1, %%mm2		\n\t" // BAfe
+		"movq %%mm2, 8(%0, %%edi)	\n\t"
+		"movq %%mm0, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1536), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 6*256;
+}
+
+/*
+ * MMX version of a52_resample_2F_2R_LFE_to_6_C: five input planes of 256
+ * samples -> 6 int16 slots per frame
+ * (plane 1, plane 2, plane 3, plane 4, 0, plane 0).
+ *
+ * esi runs from -1024 to 0 in steps of 8 (2 samples per plane per pass);
+ * edi = esi*3 is the matching output offset (24 bytes per pass).  f+256 and
+ * s16+1536 mark the end of plane 0 and of the output.  The zero slot comes
+ * from mm4, which is cleared inside the loop before the unpack.  The lane
+ * comments list words from most to least significant; lower-case letters
+ * are the first sample of the pass, upper-case the second.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+//		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq 1024(%1, %%esi), %%mm0	\n\t"
+		"movq 2048(%1, %%esi), %%mm1	\n\t"
+		"movq 3072(%1, %%esi), %%mm2	\n\t"
+		"movq 4096(%1, %%esi), %%mm3	\n\t"
+		"movq (%1, %%esi), %%mm5	\n\t" 
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t"
+		
+		"packssdw %%mm2, %%mm0		\n\t" // CcAa
+		"packssdw %%mm3, %%mm1		\n\t" // DdBb
+		"packssdw %%mm5, %%mm5		\n\t" // FfFf
+		"movq %%mm0, %%mm2		\n\t" // CcAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // DCdc
+		"pxor %%mm4, %%mm4		\n\t" // 0000
+		"punpcklwd %%mm5, %%mm4		\n\t" // F0f0
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"movq %%mm4, %%mm3		\n\t" // F0f0
+		"punpckldq %%mm2, %%mm0		\n\t" // dcba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		"punpckldq %%mm1, %%mm4		\n\t" // BAf0
+		"punpckhdq %%mm3, %%mm2		\n\t" // F0DC
+		
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm4, 8(%0, %%edi)	\n\t"
+		"movq %%mm2, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1536), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 6*256;
+}
+
+/*
+ * MMX version of a52_resample_3F_2R_LFE_to_6_C: six input planes of 256
+ * samples -> 6 int16 slots per frame in plane order 1, 3, 4, 5, 2, 0.
+ *
+ * esi runs from -1024 to 0 in steps of 8 (2 samples per plane per pass);
+ * edi = esi*3 is the matching output offset (24 bytes per pass).  f+256 and
+ * s16+1536 mark the end of plane 0 and of the output; displacements of
+ * 1024*k select plane k.  The lane comments list words from most to least
+ * significant; lower-case letters are the first sample of the pass,
+ * upper-case the second.
+ * Returns the number of int16 values written (6*256).
+ */
+static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
+    int32_t * f = (int32_t *) _f;
+	asm volatile(
+		"movl $-1024, %%esi		\n\t"
+		"movq "MANGLE(magicF2W)", %%mm7	\n\t"
+//		"pxor %%mm6, %%mm6		\n\t"
+		"1:				\n\t"
+		"movq 1024(%1, %%esi), %%mm0	\n\t"
+		"movq 3072(%1, %%esi), %%mm1	\n\t"
+		"movq 4096(%1, %%esi), %%mm2	\n\t"
+		"movq 5120(%1, %%esi), %%mm3	\n\t"
+		"movq 2048(%1, %%esi), %%mm4	\n\t"
+		"movq (%1, %%esi), %%mm5	\n\t" 
+		"psubd %%mm7, %%mm0		\n\t"
+		"psubd %%mm7, %%mm1		\n\t"
+		"psubd %%mm7, %%mm2		\n\t"
+		"psubd %%mm7, %%mm3		\n\t"
+		"psubd %%mm7, %%mm4		\n\t"
+		"psubd %%mm7, %%mm5		\n\t"
+		"leal (%%esi, %%esi, 2), %%edi	\n\t"
+		
+		"packssdw %%mm2, %%mm0		\n\t" // CcAa
+		"packssdw %%mm3, %%mm1		\n\t" // DdBb
+		"packssdw %%mm4, %%mm4		\n\t" // EeEe
+		"packssdw %%mm5, %%mm5		\n\t" // FfFf
+		"movq %%mm0, %%mm2		\n\t" // CcAa
+		"punpcklwd %%mm1, %%mm0		\n\t" // BAba
+		"punpckhwd %%mm1, %%mm2		\n\t" // DCdc
+		"punpcklwd %%mm5, %%mm4		\n\t" // FEfe
+		"movq %%mm0, %%mm1		\n\t" // BAba
+		"movq %%mm4, %%mm3		\n\t" // FEfe
+		"punpckldq %%mm2, %%mm0		\n\t" // dcba
+		"punpckhdq %%mm1, %%mm1		\n\t" // BABA
+		"punpckldq %%mm1, %%mm4		\n\t" // BAfe
+		"punpckhdq %%mm3, %%mm2		\n\t" // FEDC
+		
+		"movq %%mm0, (%0, %%edi)	\n\t"
+		"movq %%mm4, 8(%0, %%edi)	\n\t"
+		"movq %%mm2, 16(%0, %%edi)	\n\t"
+		"addl $8, %%esi			\n\t"
+		" jnz 1b			\n\t"
+		"emms				\n\t"
+		:: "r" (s16+1536), "r" (f+256)
+		:"%esi", "%edi", "memory"
+	);
+    return 6*256;
+}
+
+
+/*
+ * Look up the MMX converter for a channel configuration.
+ *
+ * flags - channel configuration (A52_* value, optionally or-ed with A52_LFE)
+ * ch    - number of interleaved output channels requested
+ *
+ * Returns the matching converter, or NULL when the (flags, ch) pair is not
+ * listed in the table below (there is no MMX mono -> 1 channel routine, for
+ * example).
+ */
+static void* a52_resample_MMX(int flags, int ch){
+    typedef int (*converter_fn) (float *, int16_t *);
+    static const struct {
+	int mode;
+	int chans;
+	converter_fn fn;
+    } known[] = {
+	{ A52_MONO,              5, a52_resample_MONO_to_5_MMX },
+	{ A52_CHANNEL,           2, a52_resample_STEREO_to_2_MMX },
+	{ A52_STEREO,            2, a52_resample_STEREO_to_2_MMX },
+	{ A52_DOLBY,             2, a52_resample_STEREO_to_2_MMX },
+	{ A52_3F,                5, a52_resample_3F_to_5_MMX },
+	{ A52_2F2R,              4, a52_resample_2F_2R_to_4_MMX },
+	{ A52_3F2R,              5, a52_resample_3F_2R_to_5_MMX },
+	{ A52_MONO | A52_LFE,    6, a52_resample_MONO_LFE_to_6_MMX },
+	{ A52_CHANNEL | A52_LFE, 6, a52_resample_STEREO_LFE_to_6_MMX },
+	{ A52_STEREO | A52_LFE,  6, a52_resample_STEREO_LFE_to_6_MMX },
+	{ A52_DOLBY | A52_LFE,   6, a52_resample_STEREO_LFE_to_6_MMX },
+	{ A52_3F | A52_LFE,      6, a52_resample_3F_LFE_to_6_MMX },
+	{ A52_2F2R | A52_LFE,    6, a52_resample_2F_2R_LFE_to_6_MMX },
+	{ A52_3F2R | A52_LFE,    6, a52_resample_3F_2R_LFE_to_6_MMX },
+    };
+    unsigned k;
+
+    for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+	if (known[k].mode == flags && known[k].chans == ch)
+	    return known[k].fn;
+    }
+    return NULL;
+}
+
+
author	Arpi <arpi@thot.banki.hu>	2003-04-16 20:03:07 +0000
committer	Arpi <arpi@thot.banki.hu>	2003-04-16 20:03:07 +0000
commit	1a7c3c85622fa0ff48127cfe38ece15e27bfb17c (patch)
tree	d2129e00537036d9851375e045518d4e2b1d4597 /libavcodec
parent	6814a25c676ae3f0eb73a2d7180b7fe9e62a62ec (diff)