exploit mdct symmetry

2% faster vorbis on conroe, k8. 7% on celeron. Originally committed as revision 14207 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Loren Merritt <lorenm@u.washington.edu> 2008-07-13 15:03:58 +0000
committer: Loren Merritt <lorenm@u.washington.edu> 2008-07-13 15:03:58 +0000
commit: b9fa32082c71013e90eab9e9997967d2939cf4a6 (patch)
tree: 83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386
parent: eb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff)
3 files changed, 161 insertions, 22 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 5ee168b3e3..db8be862dd 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2022,33 +2022,71 @@ static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *
         ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
+static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
+                                      const float *win, float add_bias, int len){ // 3DNow!-ext windowing kernel: asm path only for add_bias == 0, else the C version
+#ifdef HAVE_6REGS
+    if(add_bias == 0){
+        x86_reg i = -len*4;  // byte offset, stepped up by 8 until it is no longer negative
+        x86_reg j = len*4-8; // byte offset of the last 2-float pair, stepped down by 8
+        asm volatile(
+            "1: \n"
+            "pswapd  (%5,%1), %%mm1 \n" // win+len at offset j, the two floats swapped
+            "movq    (%5,%0), %%mm0 \n" // win+len at offset i
+            "pswapd  (%4,%1), %%mm5 \n" // src1 at offset j, the two floats swapped
+            "movq    (%3,%0), %%mm4 \n" // src0+len at offset i
+            "movq      %%mm0, %%mm2 \n" // copy: each window pair is used in two products
+            "movq      %%mm1, %%mm3 \n"
+            "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
+            "pfmul     %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
+            "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
+            "pfmul     %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
+            "pfadd     %%mm3, %%mm2 \n" // mm2 = sum of the first two products
+            "pfsub     %%mm0, %%mm1 \n" // mm1 = third product minus fourth
+            "pswapd    %%mm2, %%mm2 \n" // swap back so the store at offset j is in memory order
+            "movq      %%mm1, (%2,%0) \n" // difference -> dst+len at offset i
+            "movq      %%mm2, (%2,%1) \n" // sum        -> dst+len at offset j
+            "sub $8, %1 \n"
+            "add $8, %0 \n"
+            "jl 1b \n" // flags come from the add: loop while i is still negative
+            "femms \n" // exit MMX/3DNow! state before returning to C code
+            :"+r"(i), "+r"(j) // %0 = i, %1 = j
+            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) // %2..%5
+        ); // NOTE(review): stores through %2 but declares no "memory" clobber -- confirm this is intended
+    }else // nonzero bias takes the C path; without HAVE_6REGS the call below is unconditional
+#endif
+        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
+}
+
 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                    const float *win, float add_bias, int len){
 #ifdef HAVE_6REGS
     if(add_bias == 0){
-        x86_reg i = -len*2;
-        x86_reg j = len*2-16;
+        x86_reg i = -len*4;
+        x86_reg j = len*4-16;
         asm volatile(
             "1: \n"
-            "movaps       (%5,%0), %%xmm0 \n"
             "movaps       (%5,%1), %%xmm1 \n"
+            "movaps       (%5,%0), %%xmm0 \n"
+            "movaps       (%4,%1), %%xmm5 \n"
+            "movaps       (%3,%0), %%xmm4 \n"
+            "shufps $0x1b, %%xmm1, %%xmm1 \n"
+            "shufps $0x1b, %%xmm5, %%xmm5 \n"
             "movaps        %%xmm0, %%xmm2 \n"
             "movaps        %%xmm1, %%xmm3 \n"
+            "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
+            "mulps         %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
+            "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
+            "mulps         %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
+            "addps         %%xmm3, %%xmm2 \n"
+            "subps         %%xmm0, %%xmm1 \n"
             "shufps $0x1b, %%xmm2, %%xmm2 \n"
-            "shufps $0x1b, %%xmm3, %%xmm3 \n"
-            "mulps        (%4,%0), %%xmm0 \n"
-            "mulps        (%4,%1), %%xmm1 \n"
-            "mulps        (%3,%0), %%xmm3 \n"
-            "mulps        (%3,%1), %%xmm2 \n"
-            "addps         %%xmm3, %%xmm0 \n"
-            "addps         %%xmm2, %%xmm1 \n"
-            "movaps        %%xmm0, (%2,%0) \n"
-            "movaps        %%xmm1, (%2,%1) \n"
+            "movaps        %%xmm1, (%2,%0) \n"
+            "movaps        %%xmm2, (%2,%1) \n"
             "sub $16, %1 \n"
             "add $16, %0 \n"
             "jl 1b \n"
             :"+r"(i), "+r"(j)
-            :"r"(dst+len/2), "r"(src0+len/2), "r"(src1+len/2), "r"(win+len/2)
+            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
         );
     }else
 #endif
@@ -2638,8 +2676,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
             }
         }
-        if(mm_flags & MM_3DNOWEXT)
+        if(mm_flags & MM_3DNOWEXT){
             c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+            c->vector_fmul_window = vector_fmul_window_3dnow2;
+        }
         if(mm_flags & MM_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->vector_fmul = vector_fmul_sse;
diff --git a/libavcodec/i386/fft_3dn2.c b/libavcodec/i386/fft_3dn2.c
index 32c4be369b..9068dff24b 100644
--- a/libavcodec/i386/fft_3dn2.c
+++ b/libavcodec/i386/fft_3dn2.c
@@ -124,10 +124,9 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
     asm volatile("femms");
 }
 
-void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
-                        const FFTSample *input, FFTSample *tmp)
+static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
 {
-    long n8, n4, n2, n;
+    long n4, n2, n;
     x86_reg k;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
@@ -138,7 +137,6 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
     n = 1 << s->nbits;
     n2 = n >> 1;
     n4 = n >> 2;
-    n8 = n >> 3;
 
     /* pre rotation */
     in1 = input;
@@ -182,6 +180,20 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
             :"m"(tcos[k]), "m"(tsin[k])
         );
     }
+}
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg k;
+    long n8, n2, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp);
 
     k = n-8;
     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
@@ -212,3 +224,40 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
     asm volatile("femms");
 }
 
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp) // runs imdct_3dn2, then writes a reordered, sign-adjusted copy of tmp to output
+{
+    x86_reg j, k;
+    long n8, n4, n;
+    FFTComplex *z = (FFTComplex *)tmp; // tmp viewed as complex pairs
+
+    n = 1 << s->nbits;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp); // shared stage; presumably leaves its result in tmp, read via z below -- confirm
+
+    j = -n;  // byte offset relative to z+n8 / output+n4, stepped up by 8
+    k = n-8; // byte offset relative to the same bases, stepped down by 8
+    asm volatile("movd %0, %%mm7" ::"r"(1<<31)); // mm7 = sign bit in the low dword only; NOTE(review): set in a separate asm statement from its use, and 1<<31 shifts into the int sign bit
+    asm volatile(
+        "1: \n\t"
+        "movq    (%3,%1), %%mm0 \n\t" // z[n8+k]
+        "pswapd  (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+        "movq      %%mm0, %%mm2 \n\t" // keep a copy; mm0 is overwritten by the unpack below
+        "punpckldq %%mm1, %%mm0 \n\t" // mm0 = { low(mm0), low(mm1) }
+        "punpckhdq %%mm2, %%mm1 \n\t" // mm1 = { high(mm1), high(mm2) }
+        "pxor      %%mm7, %%mm0 \n\t" // flip the sign of the low float
+        "pxor      %%mm7, %%mm1 \n\t" // flip the sign of the low float
+        "movq      %%mm0, (%2,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
+        "movq      %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
+        "sub $8, %1 \n\t"
+        "add $8, %0 \n\t"
+        "jl 1b \n\t" // flags come from the add: loop while j is still negative
+        :"+r"(j), "+r"(k) // %0 = j, %1 = k
+        :"r"(output+n4), "r"(z+n8) // %2 = output+n4, %3 = z+n8
+        :"memory"
+    );
+    asm volatile("femms"); // exit MMX/3DNow! state before returning to C code
+}
+
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c
index 83cbd87088..305f44a0ce 100644
--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -142,11 +142,10 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
     } while (nblocks != 0);
 }
 
-void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
-                       const FFTSample *input, FFTSample *tmp)
+static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
 {
     x86_reg k;
-    long n8, n4, n2, n;
+    long n4, n2, n;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
     const FFTSample *tsin = s->tsin;
@@ -156,7 +155,6 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
     n = 1 << s->nbits;
     n2 = n >> 1;
     n4 = n >> 2;
-    n8 = n >> 3;
 
 #ifdef ARCH_X86_64
     asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
@@ -260,6 +258,20 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
 #endif
         );
     }
+}
+
+void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg k;
+    long n8, n2, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n8 = n >> 3;
+
+    imdct_sse(s, input, tmp);
 
     /*
        Mnemonics:
@@ -301,3 +313,41 @@ void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
     );
 }
 
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
+                       const FFTSample *input, FFTSample *tmp) // runs imdct_sse, then writes a shuffled, sign-adjusted copy of tmp to output
+{
+    x86_reg j, k;
+    long n8, n4, n;
+    FFTComplex *z = (FFTComplex *)tmp; // tmp viewed as complex pairs
+
+    n = 1 << s->nbits;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    imdct_sse(s, input, tmp); // shared stage; presumably leaves its result in tmp, read via z below -- confirm
+
+    j = -n;   // byte offset relative to z+n8 / output+n4, stepped up by 16
+    k = n-16; // byte offset relative to the same bases, stepped down by 16
+    asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); // xmm7 = m1m1m1m1 (defined outside this hunk; presumably a sign-bit mask -- confirm); NOTE(review): set in a separate asm statement from its use
+    asm volatile(
+        "1: \n\t"
+        "movaps     (%3,%1), %%xmm0 \n\t" // 16 bytes from z+n8 at offset k
+        "movaps     (%3,%0), %%xmm1 \n\t" // 16 bytes from z+n8 at offset j
+        "xorps       %%xmm7, %%xmm0 \n\t" // apply the xmm7 mask to the offset-k vector
+        "movaps      %%xmm0, %%xmm2 \n\t" // copy; both shuffles below need the masked vector
+        "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = { xmm0[1], xmm0[3], xmm1[0], xmm1[2] }
+        "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = { xmm2[0], xmm2[2], xmm1[1], xmm1[3] }
+        "shufps $54, %%xmm0, %%xmm0 \n\t" // xmm0 = { xmm0[2], xmm0[1], xmm0[3], xmm0[0] }
+        "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = { xmm2[0], xmm2[3], xmm2[1], xmm2[2] }
+        "xorps       %%xmm7, %%xmm0 \n\t" // apply the mask again to the vector stored at offset j
+        "movaps      %%xmm2, (%2,%1) \n\t" // store to output+n4 at offset k
+        "movaps      %%xmm0, (%2,%0) \n\t" // store to output+n4 at offset j
+        "sub $16, %1 \n\t"
+        "add $16, %0 \n\t"
+        "jl 1b \n\t" // flags come from the add: loop while j is still negative
+        :"+r"(j), "+r"(k) // %0 = j, %1 = k
+        :"r"(output+n4), "r"(z+n8) // %2 = output+n4, %3 = z+n8
+        :"memory"
+    );
+}
+
author	Loren Merritt <lorenm@u.washington.edu>	2008-07-13 15:03:58 +0000
committer	Loren Merritt <lorenm@u.washington.edu>	2008-07-13 15:03:58 +0000
commit	b9fa32082c71013e90eab9e9997967d2939cf4a6 (patch)
tree	83edd135988c73a75b017fbd12396e156de5e0a4 /libavcodec/i386
parent	eb2cd99c73df74cba8ce0173f9ee2b70313adaa6 (diff)