From 11ab1e409ff272c8c0bce62f48a3767546547c6c Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sat, 12 Feb 2011 11:48:16 +0000 Subject: FFT: factor a shuffle out of the inner loop and merge it into fft_permute. 6% faster SSE FFT on Conroe, 2.5% on Penryn. Signed-off-by: Janne Grunau (cherry picked from commit e6b1ed693ae4098e6b9eabf938fc31ec0b09b120) --- libavcodec/arm/fft_init_arm.c | 2 +- libavcodec/fft.c | 9 +++++-- libavcodec/fft.h | 5 +++- libavcodec/mdct.c | 4 +-- libavcodec/x86/fft.c | 1 + libavcodec/x86/fft_mmx.asm | 62 +++++++++++++++++++++---------------------- 6 files changed, 45 insertions(+), 38 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 3f2d554413..dff0689566 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -44,7 +44,7 @@ av_cold void ff_fft_init_arm(FFTContext *s) s->imdct_calc = ff_imdct_calc_neon; s->imdct_half = ff_imdct_half_neon; s->mdct_calc = ff_mdct_calc_neon; - s->permutation = FF_MDCT_PERM_INTERLEAVE; + s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; } } diff --git a/libavcodec/fft.c b/libavcodec/fft.c index 3fd4d279cb..eade76a516 100644 --- a/libavcodec/fft.c +++ b/libavcodec/fft.c @@ -97,6 +97,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) if (!s->tmp_buf) goto fail; s->inverse = inverse; + s->fft_permutation = FF_FFT_PERM_DEFAULT; s->fft_permute = ff_fft_permute_c; s->fft_calc = ff_fft_calc_c; @@ -113,8 +114,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) for(j=4; j<=nbits; j++) { ff_init_ff_cos_tabs(j); } - for(i=0; i<n; i++) - s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; + for(i=0; i<n; i++) { + int j = i; + if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) + j = (j&~3) | ((j>>1)&1) | ((j<<1)&2); + s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j; + } return 0; fail: diff --git a/libavcodec/fft.h b/libavcodec/fft.h index 37cbfbf6a1..58a7f30ad1 100644 --- a/libavcodec/fft.h +++ 
b/libavcodec/fft.h @@ -44,7 +44,10 @@ struct FFTContext { void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - int permutation; + int fft_permutation; +#define FF_FFT_PERM_DEFAULT 0 +#define FF_FFT_PERM_SWAP_LSBS 1 + int mdct_permutation; #define FF_MDCT_PERM_NONE 0 #define FF_MDCT_PERM_INTERLEAVE 1 }; diff --git a/libavcodec/mdct.c b/libavcodec/mdct.c index c511188d22..819f618115 100644 --- a/libavcodec/mdct.c +++ b/libavcodec/mdct.c @@ -71,7 +71,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) s->mdct_bits = nbits; s->mdct_size = n; n4 = n >> 2; - s->permutation = FF_MDCT_PERM_NONE; + s->mdct_permutation = FF_MDCT_PERM_NONE; if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) goto fail; @@ -80,7 +80,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) if (!s->tcos) goto fail; - switch (s->permutation) { + switch (s->mdct_permutation) { case FF_MDCT_PERM_NONE: s->tsin = s->tcos + n4; tstep = 1; diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index 771b1e6649..5ca341d5e4 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -30,6 +30,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s) s->imdct_half = ff_imdct_half_sse; s->fft_permute = ff_fft_permute_sse; s->fft_calc = ff_fft_calc_sse; + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { /* 3DNowEx for K7 */ s->imdct_calc = ff_imdct_calc_3dn2; diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 1dcd629184..68e20df3b0 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -51,6 +51,7 @@ SECTION_RODATA %define M_SQRT1_2 0.70710678118654752440 ps_root2: times 4 dd M_SQRT1_2 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 
+ps_p1p1m1p1: dd 0, 0, 1<<31, 0 ps_m1p1: dd 1<<31, 0 %assign i 16 @@ -95,54 +96,51 @@ section .text align=16 SWAP %3, %6 %endmacro -; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %macro T4_SSE 3 mova %3, %1 - shufps %1, %2, 0x64 ; {r0,i0,r3,i2} - shufps %3, %2, 0xce ; {r1,i1,r2,i3} + addps %1, %2 ; {t1,t2,t6,t5} + subps %3, %2 ; {t3,t4,-t8,t7} + xorps %3, [ps_p1p1m1p1] mova %2, %1 - addps %1, %3 ; {t1,t2,t6,t5} - subps %2, %3 ; {t3,t4,t8,t7} + shufps %1, %3, 0x44 ; {t1,t2,t3,t4} + shufps %2, %3, 0xbe ; {t6,t5,t7,t8} mova %3, %1 - shufps %1, %2, 0x44 ; {t1,t2,t3,t4} - shufps %3, %2, 0xbe ; {t6,t5,t7,t8} + addps %1, %2 ; {r0,i0,r1,i1} + subps %3, %2 ; {r2,i2,r3,i3} mova %2, %1 - addps %1, %3 ; {r0,i0,r1,i1} - subps %2, %3 ; {r2,i2,r3,i3} - mova %3, %1 - shufps %1, %2, 0x88 ; {r0,r1,r2,r3} - shufps %3, %2, 0xdd ; {i0,i1,i2,i3} - SWAP %2, %3 + shufps %1, %3, 0x88 ; {r0,r1,r2,r3} + shufps %2, %3, 0xdd ; {i0,i1,i2,i3} %endmacro -%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 - mova %5, %3 - shufps %3, %4, 0x44 ; {r4,i4,r6,i6} - shufps %5, %4, 0xee ; {r5,i5,r7,i7} +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} +%macro T8_SSE 6 mova %6, %3 - subps %3, %5 ; {r5,i5,r7,i7} - addps %6, %5 ; {t1,t2,t3,t4} - mova %5, %3 - shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} + subps %3, %4 ; {r5,i5,r7,i7} + addps %6, %4 ; {t1,t2,t3,t4} + mova %4, %3 + shufps %4, %4, 0xb1 ; {i5,r5,i7,r7} mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} - mulps %5, [ps_root2] - addps %3, %5 ; {t8,t7,ta,t9} - mova %5, %6 + mulps %4, [ps_root2] + addps %3, %4 ; {t8,t7,ta,t9} + mova %4, %6 shufps %6, %3, 0x36 ; {t3,t2,t9,t8} - shufps %5, %3, 0x9c ; {t1,t4,t7,ta} + shufps %4, %3, 0x9c ; {t1,t4,t7,ta} mova %3, %6 - addps %6, %5 ; {t1,t2,t9,ta} - subps %3, %5 ; {t6,t5,tc,tb} - mova %5, %6 + addps %6, %4 ; {t1,t2,t9,ta} + subps %3, %4 ; {t6,t5,tc,tb} + mova %4, 
%6 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} - shufps %5, %3, 0x8d ; {t2,ta,t6,tc} + shufps %4, %3, 0x8d ; {t2,ta,t6,tc} mova %3, %1 - mova %4, %2 + mova %5, %2 addps %1, %6 ; {r0,r1,r2,r3} - addps %2, %5 ; {i0,i1,i2,i3} + addps %2, %4 ; {i0,i1,i2,i3} subps %3, %6 ; {r4,r5,r6,r7} - subps %4, %5 ; {i4,i5,i6,i7} + subps %5, %4 ; {i4,i5,i6,i7} + SWAP %4, %5 %endmacro ; scheduled for cpu-bound sizes -- cgit v1.2.3