sse & sse2 implementations of vorbis channel coupling.

9% faster vorbis (on a K8). Originally committed as revision 5898 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2006-08-03 03:18:47 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2006-08-03 03:18:47 +0000
commit: 2dac4acfc0f2abbe28082cdb5c3ed775a78d2867 (patch)
tree: ae3bf6a7ddd9bb5bf29a305eef842488629965d9
parent: 7bf0049623652b92a566999d37f0b481c2056d6e (diff)
5 files changed, 95 insertions, 20 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 9b79b8659d..937dceb2c4 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -35,6 +35,9 @@
 /* snow.c */
 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
 
+/* vorbis.c */
+void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
+
 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
 uint32_t squareTbl[512] = {0, };
 
@@ -4090,6 +4093,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->inner_add_yblock = ff_snow_inner_add_yblock;
 #endif
 
+#ifdef CONFIG_VORBIS_DECODER
+    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
+#endif
+
     c->shrink[0]= ff_img_copy_plane;
     c->shrink[1]= ff_shrink22;
     c->shrink[2]= ff_shrink44;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index a2a5171129..a608350294 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -307,6 +307,8 @@ typedef struct DSPContext {
 
     void (*h261_loop_filter)(uint8_t *src, int stride);
 
+    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
+
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
     void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index ec6b2ad1a7..afcb02e4db 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2711,6 +2711,59 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
 }
 #endif
 
+/* Inverse channel coupling in place, two floats per pass. NOTE: despite the _sse name the FP opcodes used (pfcmpgt/pfcmpge/pfadd/pfsub) are 3DNow!. */
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+    int i;
+    asm volatile("pxor %%mm7, %%mm7":); // mm7 = 0.0 for the compares below; relies on nothing touching mm7 between the asm statements
+    for(i=0; i<blocksize; i+=2) { // works on pairs: an odd blocksize would touch one element past the end
+        asm volatile(
+            "movq    %0,    %%mm0 \n\t" // mm0 = m (mag[i], mag[i+1])
+            "movq    %1,    %%mm1 \n\t" // mm1 = a (ang[i], ang[i+1])
+            "movq    %%mm0, %%mm2 \n\t"
+            "movq    %%mm1, %%mm3 \n\t"
+            "pfcmpgt %%mm7, %%mm2 \n\t" // mask: m > 0.0 (strict, like the scalar C version, so m == 0.0 is not negated)
+            "pfcmpge %%mm7, %%mm3 \n\t" // mask: a >= 0.0
+            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
+            "pxor    %%mm2, %%mm1 \n\t" // a' = (m > 0.0) ? -a : a
+            "movq    %%mm3, %%mm4 \n\t"
+            "pand    %%mm1, %%mm3 \n\t" // (a >= 0.0) ? a' : 0
+            "pandn   %%mm1, %%mm4 \n\t" // (a >= 0.0) ? 0 : a'
+            "pfadd   %%mm0, %%mm3 \n\t" // new a = m + ((a >= 0.0) ? a' : 0)
+            "pfsub   %%mm4, %%mm0 \n\t" // new m = m - ((a >= 0.0) ? 0 : a')
+            "movq    %%mm3, %1    \n\t"
+            "movq    %%mm0, %0    \n\t"
+            :"+m"(mag[i]), "+m"(ang[i]) :: "memory" // each movq spans two floats; the memory clobber covers the second one
+        );
+    }
+    asm volatile("emms");
+}
+/* Inverse channel coupling in place, four floats per pass; the movaps accesses need mag and ang 16-byte aligned. */
+static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
+{
+    int i;
+    for(i=0; i<blocksize; i+=4) { // works on groups of four: a blocksize that is not a multiple of 4 overruns the arrays
+        asm volatile(
+            "movaps  %0,     %%xmm0 \n\t" // xmm0 = m (mag[i..i+3])
+            "movaps  %1,     %%xmm1 \n\t" // xmm1 = a (ang[i..i+3])
+            "pxor    %%xmm2, %%xmm2 \n\t"
+            "pxor    %%xmm3, %%xmm3 \n\t"
+            "cmpltps %%xmm0, %%xmm2 \n\t" // mask: 0.0 < m (strict, like the scalar C version, so m == 0.0 is not negated)
+            "cmpleps %%xmm1, %%xmm3 \n\t" // mask: 0.0 <= a
+            "pslld   $31,    %%xmm2 \n\t" // keep only the sign bit
+            "pxor    %%xmm2, %%xmm1 \n\t" // a' = (m > 0.0) ? -a : a
+            "movaps  %%xmm3, %%xmm4 \n\t"
+            "pand    %%xmm1, %%xmm3 \n\t" // (a >= 0.0) ? a' : 0
+            "pandn   %%xmm1, %%xmm4 \n\t" // (a >= 0.0) ? 0 : a'
+            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + ((a >= 0.0) ? a' : 0)
+            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - ((a >= 0.0) ? 0 : a')
+            "movaps  %%xmm3, %1     \n\t"
+            "movaps  %%xmm0, %0     \n\t"
+            :"+m"(mag[i]), "+m"(ang[i]) :: "memory" // each movaps spans four floats; the memory clobber covers the rest
+        );
+    }
+}
+
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
 extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
@@ -3137,6 +3190,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
         }
 #endif
+
+        if(mm_flags & MM_SSE2)
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2;
+        else if(mm_flags & MM_3DNOW) // the "_sse" routine executes 3DNow! opcodes (pfadd/pfsub/pfcmp*), so gate it on 3DNow! rather than SSE
+            c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
     }
 
 #ifdef CONFIG_ENCODERS
diff --git a/libavcodec/vorbis.c b/libavcodec/vorbis.c
index 9adec4bed7..cdf7cee579 100644
--- a/libavcodec/vorbis.c
+++ b/libavcodec/vorbis.c
@@ -929,6 +929,7 @@ static int vorbis_decode_init(AVCodecContext *avccontext) {
     int i, j, hdr_type;
 
     vc->avccontext = avccontext;
+    dsputil_init(&vc->dsp, avccontext);
 
     if (!headers_len) {
         av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n");
@@ -1443,6 +1444,31 @@ static int vorbis_residue_decode(vorbis_context *vc, vorbis_residue *vr, uint_fa
     return 0;
 }
 
+/* Undo Vorbis channel coupling in place on blocksize (mag[n], ang[n]) pairs. */
+void vorbis_inverse_coupling(float *mag, float *ang, int blocksize)
+{
+    int n;
+    for(n=0; n<blocksize; n++)
+    {
+        const float m=mag[n];
+        const float a=ang[n];
+        if (a>0.0) {
+            // positive angle: mag is left alone, ang becomes m-a or m+a
+            if (m>0.0)
+                ang[n]=m-a;
+            else
+                ang[n]=m+a;
+        } else {
+            // non-positive angle: ang takes the old magnitude, mag absorbs a
+            ang[n]=m;
+            if (m>0.0)
+                mag[n]=m+a;
+            else
+                mag[n]=m-a;
+        }
+    }
+}
+
 // Decode the audio packet using the functions above
 #define BIAS 385
 
@@ -1541,26 +1567,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) {
 
         mag=vc->channel_residues+res_chan[mapping->magnitude[i]]*blocksize/2;
         ang=vc->channel_residues+res_chan[mapping->angle[i]]*blocksize/2;
-        for(j=0;j<blocksize/2;++j) {
-            float temp;
-            if (mag[j]>0.0) {
-                if (ang[j]>0.0) {
-                    ang[j]=mag[j]-ang[j];
-                } else {
-                    temp=ang[j];
-                    ang[j]=mag[j];
-                    mag[j]+=temp;
-                }
-            } else {
-                if (ang[j]>0.0) {
-                    ang[j]+=mag[j];
-                } else {
-                    temp=ang[j];
-                    ang[j]=mag[j];
-                    mag[j]-=temp;
-                }
-            }
-        }
+        vc->dsp.vorbis_inverse_coupling(mag, ang, blocksize/2);
     }
 
 // Dotproduct
diff --git a/libavcodec/vorbis.h b/libavcodec/vorbis.h
index c818207d92..1274f1891f 100644
--- a/libavcodec/vorbis.h
+++ b/libavcodec/vorbis.h
@@ -87,6 +87,7 @@ typedef struct {
 typedef struct vorbis_context_s {
     AVCodecContext *avccontext;
     GetBitContext gb;
+    DSPContext dsp;
 
     MDCTContext mdct0;
     MDCTContext mdct1;
author	Loren Merritt <lorenm@u.washington.edu>	2006-08-03 03:18:47 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2006-08-03 03:18:47 +0000
commit	2dac4acfc0f2abbe28082cdb5c3ed775a78d2867 (patch)
tree	ae3bf6a7ddd9bb5bf29a305eef842488629965d9
parent	7bf0049623652b92a566999d37f0b481c2056d6e (diff)