summaryrefslogtreecommitdiff
path: root/libavcodec/i386/fft_sse.c
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2008-08-12 00:26:58 +0000
committerLoren Merritt <lorenm@u.washington.edu>2008-08-12 00:26:58 +0000
commit5d0ddd1a9fcdfbb6b24e75af4384e1d36a1d331e (patch)
tree7395fe9347c87a04885ace06959a8b0c0a940a7e /libavcodec/i386/fft_sse.c
parentbafad220a712f9b3a4fe8cdf5f94b79a9c62dd5a (diff)
split-radix FFT
c is 1.9x faster than previous c (on various x86 cpus), sse is 1.6x faster than previous sse. Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386/fft_sse.c')
-rw-r--r--libavcodec/i386/fft_sse.c143
1 files changed, 37 insertions, 106 deletions
diff --git a/libavcodec/i386/fft_sse.c b/libavcodec/i386/fft_sse.c
index 305f44a0ce..77a579011a 100644
--- a/libavcodec/i386/fft_sse.c
+++ b/libavcodec/i386/fft_sse.c
@@ -22,124 +22,55 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
-static const int p1p1p1m1[4] __attribute__((aligned(16))) =
- { 0, 0, 0, 1 << 31 };
-
-static const int p1p1m1p1[4] __attribute__((aligned(16))) =
- { 0, 0, 1 << 31, 0 };
-
-static const int p1p1m1m1[4] __attribute__((aligned(16))) =
- { 0, 0, 1 << 31, 1 << 31 };
-
static const int p1m1p1m1[4] __attribute__((aligned(16))) =
{ 0, 1 << 31, 0, 1 << 31 };
static const int m1m1m1m1[4] __attribute__((aligned(16))) =
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
-#if 0
-static void print_v4sf(const char *str, __m128 a)
-{
- float *p = (float *)&a;
- printf("%s: %f %f %f %f\n",
- str, p[0], p[1], p[2], p[3]);
-}
-#endif
+void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
-/* XXX: handle reverse case */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
- int ln = s->nbits;
- x86_reg i;
- long j;
- long nblocks, nloops;
- FFTComplex *p, *cptr;
+ int n = 1 << s->nbits;
- asm volatile(
- "movaps %0, %%xmm4 \n\t"
- "movaps %1, %%xmm5 \n\t"
- ::"m"(*p1p1m1m1),
- "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
- );
+ ff_fft_dispatch_interleave_sse(z, s->nbits);
- i = 8 << ln;
- asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- /* do the pass 0 butterfly */
- "movaps (%0,%1), %%xmm0 \n\t"
- "movaps %%xmm0, %%xmm1 \n\t"
- "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
- "xorps %%xmm4, %%xmm1 \n\t"
- "addps %%xmm1, %%xmm0 \n\t"
- "movaps 16(%0,%1), %%xmm2 \n\t"
- "movaps %%xmm2, %%xmm3 \n\t"
- "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm4, %%xmm3 \n\t"
- "addps %%xmm3, %%xmm2 \n\t"
- /* multiply third by -i */
- /* by toggling the sign bit */
- "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
- "xorps %%xmm5, %%xmm2 \n\t"
- /* do the pass 1 butterfly */
- "movaps %%xmm0, %%xmm1 \n\t"
- "addps %%xmm2, %%xmm0 \n\t"
- "subps %%xmm2, %%xmm1 \n\t"
- "movaps %%xmm0, (%0,%1) \n\t"
- "movaps %%xmm1, 16(%0,%1) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(z)
- );
- /* pass 2 .. ln-1 */
+ if(n <= 16) {
+ x86_reg i = -8*n;
+ asm volatile(
+ "1: \n"
+ "movaps (%0,%1), %%xmm0 \n"
+ "movaps %%xmm0, %%xmm1 \n"
+ "unpcklps 16(%0,%1), %%xmm0 \n"
+ "unpckhps 16(%0,%1), %%xmm1 \n"
+ "movaps %%xmm0, (%0,%1) \n"
+ "movaps %%xmm1, 16(%0,%1) \n"
+ "add $32, %0 \n"
+ "jl 1b \n"
+ :"+r"(i)
+ :"r"(z+n)
+ :"memory"
+ );
+ }
+}
- nblocks = 1 << (ln-3);
- nloops = 1 << 2;
- cptr = s->exptab1;
- do {
- p = z;
- j = nblocks;
- do {
- i = nloops*8;
- asm volatile(
- "1: \n\t"
- "sub $32, %0 \n\t"
- "movaps (%2,%0), %%xmm1 \n\t"
- "movaps (%1,%0), %%xmm0 \n\t"
- "movaps 16(%2,%0), %%xmm5 \n\t"
- "movaps 16(%1,%0), %%xmm4 \n\t"
- "movaps %%xmm1, %%xmm2 \n\t"
- "movaps %%xmm5, %%xmm6 \n\t"
- "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
- "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
- "shufps $0xA0, %%xmm5, %%xmm5 \n\t"
- "shufps $0xF5, %%xmm6, %%xmm6 \n\t"
- "mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
- "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
- "mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re
- "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
- "addps %%xmm2, %%xmm1 \n\t"
- "addps %%xmm6, %%xmm5 \n\t"
- "movaps %%xmm0, %%xmm3 \n\t"
- "movaps %%xmm4, %%xmm7 \n\t"
- "addps %%xmm1, %%xmm0 \n\t"
- "subps %%xmm1, %%xmm3 \n\t"
- "addps %%xmm5, %%xmm4 \n\t"
- "subps %%xmm5, %%xmm7 \n\t"
- "movaps %%xmm0, (%1,%0) \n\t"
- "movaps %%xmm3, (%2,%0) \n\t"
- "movaps %%xmm4, 16(%1,%0) \n\t"
- "movaps %%xmm7, 16(%2,%0) \n\t"
- "jg 1b \n\t"
- :"+r"(i)
- :"r"(p), "r"(p + nloops), "r"(cptr)
- );
- p += nloops*2;
- } while (--j);
- cptr += nloops*2;
- nblocks >>= 1;
- nloops <<= 1;
- } while (nblocks != 0);
+void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
+{
+ int n = 1 << s->nbits;
+ int i;
+ for(i=0; i<n; i+=2) {
+ asm volatile(
+ "movaps %2, %%xmm0 \n"
+ "movlps %%xmm0, %0 \n"
+ "movhps %%xmm0, %1 \n"
+ :"=m"(s->tmp_buf[s->revtab[i]]),
+ "=m"(s->tmp_buf[s->revtab[i+1]])
+ :"m"(z[i])
+ );
+ }
+ memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)