3dnow2 implementation of imdct.

6% faster vorbis and wma. Originally committed as revision 5954 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2006-08-08 04:01:04 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2006-08-08 04:01:04 +0000
commit: bcfa3e58ee0ec7f8a739867ea66f9acb834e498a (patch)
tree: a7aec9632c7157f03266a46861d4a898ee2340f5 /libavcodec/i386/fft_3dn2.c
parent: 2c5ad5fd74a44145459e74acdf486c084f8de4b4 (diff)
1 files changed, 82 insertions, 1 deletions
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index aa8f0aee2e..40ec9d8eb1 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -1,6 +1,6 @@
 /*
  * FFT/MDCT transform with Extended 3DNow! optimizations
- * Copyright (c) 2006 Zuxy MENG Jie.
+ * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
  *
  * This library is free software; you can redistribute it and/or
@@ -134,3 +134,84 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
 }
 
 #endif
+/* IMDCT using Extended 3DNow! asm: pre-rotation, FFT, post-rotation, then output reordering. */
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    int k, n8, n4, n2, n;
+    const uint16_t *revtab = s->fft.revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2;
+    FFTComplex *z = (FFTComplex *)tmp; /* tmp is used as the complex work buffer */
+
+    n = 1 << s->nbits; /* transform size; n2, n4, n8 are n/2, n/4, n/8 */
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation: z[revtab[k]] = (in2[-2k] + i*in1[2k]) * (tcos[k] + i*tsin[k]) */
+    in1 = input; /* read forwards from input[0], stride 2 */
+    in2 = input + n2 - 1; /* read backwards from input[n2-1], stride 2 */
+    for(k = 0; k < n4; k++) {
+        asm volatile( /* lane notes below are { low dword, high dword } */
+            "movd       %1, %%mm0 \n\t" // { in2[-2k], 0 }
+            "movd       %3, %%mm1 \n\t" // { tcos[k], 0 }
+            "punpckldq  %2, %%mm0 \n\t" // { in2[-2k], in1[2k] }
+            "punpckldq  %4, %%mm1 \n\t" // { tcos[k], tsin[k] }
+            "movq    %%mm0, %%mm2 \n\t" // keep a copy of the inputs for the cross products
+            "pfmul   %%mm1, %%mm0 \n\t" // { in2*cos, in1*sin }
+            "pswapd  %%mm1, %%mm1 \n\t" // { tsin[k], tcos[k] }
+            "pfmul   %%mm1, %%mm2 \n\t" // { in2*sin, in1*cos }
+            "pfpnacc %%mm2, %%mm0 \n\t" // { in2*cos - in1*sin, in2*sin + in1*cos }
+            "movq    %%mm0, %0    \n\t" // store at the index given by revtab[k]
+            :"=m"(z[revtab[k]])
+            :"m"(in2[-2*k]), "m"(in1[2*k]),
+             "m"(tcos[k]), "m"(tsin[k])
+        ); /* NOTE(review): no mm0-mm2 clobbers are declared here -- confirm this is intentional */
+    }
+
+    ff_fft_calc(&s->fft, z); /* FFT over z; its results are read back from z below */
+
+    /* post rotation + reordering: first z[k] *= (tcos[k] + i*tsin[k]) for each k < n4 */
+    for(k = 0; k < n4; k++) {
+        asm volatile( /* lane notes below are { low dword, high dword } */
+            "movq       %0, %%mm0 \n\t" // { z[k].re, z[k].im }
+            "movd       %1, %%mm1 \n\t" // { tcos[k], 0 }
+            "punpckldq  %2, %%mm1 \n\t" // { tcos[k], tsin[k] }
+            "movq    %%mm0, %%mm2 \n\t" // keep a copy of z[k] for the cross products
+            "pfmul   %%mm1, %%mm0 \n\t" // { re*cos, im*sin }
+            "pswapd  %%mm1, %%mm1 \n\t" // { tsin[k], tcos[k] }
+            "pfmul   %%mm1, %%mm2 \n\t" // { re*sin, im*cos }
+            "pfpnacc %%mm2, %%mm0 \n\t" // { re*cos - im*sin, re*sin + im*cos }
+            "movq    %%mm0, %0    \n\t" // write the rotated value back to z[k]
+            :"+m"(z[k])
+            :"m"(tcos[k]), "m"(tsin[k])
+        );
+    }
+
+    asm volatile("movd %0, %%mm7" ::"r"(1<<31)); /* mm7 = { sign-bit mask, 0 }, used by the pxor below. NOTE(review): 1<<31 shifts into the sign bit of a signed int; consider 1U<<31 */
+    for(k = 0; k < n8; k++) { /* each iteration writes four pairs of output samples */
+        asm volatile( /* lane notes below are { low dword, high dword } */
+            "movq         %4, %%mm0 \n\t" // { z[n8+k].re, z[n8+k].im }
+            "pswapd       %5, %%mm1 \n\t" // { z[n8-1-k].im, z[n8-1-k].re }
+            "movq      %%mm0, %%mm2 \n\t" // copy of z[n8+k]
+            "pxor      %%mm7, %%mm2 \n\t" // { -z[n8+k].re, z[n8+k].im }
+            "punpckldq %%mm1, %%mm2 \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "pswapd    %%mm2, %%mm3 \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            "punpckhdq %%mm1, %%mm0 \n\t" // { z[n8+k].im, z[n8-1-k].re }
+            "pswapd    %%mm0, %%mm4 \n\t" // { z[n8-1-k].re, z[n8+k].im }
+            "pxor      %%mm7, %%mm0 \n\t" // negate the low dword of mm0
+            "pxor      %%mm7, %%mm4 \n\t" // negate the low dword of mm4
+            "movq      %%mm0, %0    \n\t" // { -z[n8+k].im, z[n8-1-k].re }
+            "movq      %%mm4, %1    \n\t" // { -z[n8-1-k].re, z[n8+k].im }
+            "movq      %%mm2, %2    \n\t" // { -z[n8+k].re, z[n8-1-k].im }
+            "movq      %%mm3, %3    \n\t" // { z[n8-1-k].im, -z[n8+k].re }
+            :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
+             "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
+            :"m"(z[n8+k]), "m"(z[n8-1-k])
+            :"memory"
+        ); /* each movq store writes two consecutive floats starting at the named output element */
+    }
+    asm volatile("emms"); /* leave MMX/3DNow! state so later x87 floating-point code works */
+}
author	Loren Merritt <lorenm@u.washington.edu>	2006-08-08 04:01:04 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2006-08-08 04:01:04 +0000
commit	bcfa3e58ee0ec7f8a739867ea66f9acb834e498a (patch)
tree	a7aec9632c7157f03266a46861d4a898ee2340f5 /libavcodec/i386/fft_3dn2.c
parent	2c5ad5fd74a44145459e74acdf486c084f8de4b4 (diff)