From 650c4300d94aa9398ff1dd4f454bf39eaa285f62 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 26 Mar 2014 15:20:42 +0100 Subject: aarch64: NEON float FFT Approximately as fast as the ARM NEON version on Apple's A7. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/fft_init_aarch64.c | 37 +++ libavcodec/aarch64/fft_neon.S | 442 ++++++++++++++++++++++++++++++++++ 3 files changed, 481 insertions(+) create mode 100644 libavcodec/aarch64/fft_init_aarch64.c create mode 100644 libavcodec/aarch64/fft_neon.S (limited to 'libavcodec/aarch64') diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 757b499db0..01f618fc06 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -1,3 +1,4 @@ +OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o @@ -10,6 +11,7 @@ OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o +NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ aarch64/h264idct_neon.o diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c new file mode 100644 index 0000000000..caa5a0d90a --- /dev/null +++ b/libavcodec/aarch64/fft_init_aarch64.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/fft.h" + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +av_cold void ff_fft_init_aarch64(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; + } +} diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S new file mode 100644 index 0000000000..5189bfb4ef --- /dev/null +++ b/libavcodec/aarch64/fft_neon.S @@ -0,0 +1,442 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard + * Copyright (c) 2009 Naotoshi Nojiri + * Copyright (c) 2014 Janne Grunau + * + * This algorithm (though not any of the implementation details) is + * based on libdjbfft by D. J. Bernstein. + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + +.macro transpose d0, d1, s0, s1 + trn1 \d0, \s0, \s1 + trn2 \d1, \s0, \s1 +.endm + + +function fft4_neon + ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] + + fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1 + + ext v16.8b, v2.8b, v3.8b, #4 + ext v17.8b, v3.8b, v2.8b, #4 + + fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3 + fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3 + + fadd v0.2s, v4.2s, v5.2s + fsub v2.2s, v4.2s, v5.2s + fadd v1.2s, v6.2s, v7.2s + fsub v3.2s, v6.2s, v7.2s + + st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] + + ret +endfunc + +function fft8_neon + mov x1, x0 + ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 + ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] + ext v22.8b, v2.8b, v3.8b, #4 + ext v23.8b, v3.8b, v2.8b, #4 + fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 + fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 + fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 + fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 + rev64 v27.2s, v28.2s // ??? + fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 + fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w + ext v6.8b, v4.8b, v5.8b, #4 + ext v7.8b, v5.8b, v4.8b, #4 + fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w + fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 + fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 + fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w + fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w + fadd v0.2s, v20.2s, v21.2s + fsub v2.2s, v20.2s, v21.2s + fadd v1.2s, v22.2s, v23.2s + rev64 v26.2s, v26.2s + rev64 v27.2s, v27.2s + fsub v3.2s, v22.2s, v23.2s + fsub v6.2s, v6.2s, v7.2s + fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 + fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 + fadd v7.2s, v4.2s, v5.2s + fsub v18.2s, v2.2s, v6.2s + ext v26.8b, v24.8b, v25.8b, #4 + ext v27.8b, v25.8b, v24.8b, #4 + fadd v2.2s, v2.2s, v6.2s + fsub v16.2s, v0.2s, v7.2s + fadd v5.2s, v25.2s, v24.2s + fsub v4.2s, v26.2s, v27.2s + fadd v0.2s, v0.2s, v7.2s + fsub v17.2s, v1.2s, v5.2s + fsub v19.2s, v3.2s, v4.2s + fadd v3.2s, v3.2s, v4.2s + fadd v1.2s, v1.2s, v5.2s + + st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] + st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] + + ret +endfunc + +function fft16_neon + mov x1, x0 + ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 + ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 + ext v22.8b, v2.8b, v3.8b, #4 + ext v23.8b, v3.8b, v2.8b, #4 + fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5 + fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7 + fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5 + fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7 + rev64 v27.2s, v28.2s // ??? + fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1 + fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3 + fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w + ext v6.8b, v4.8b, v5.8b, #4 + ext v7.8b, v5.8b, v4.8b, #4 + fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w + fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2 + fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1 + fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w + fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w + fadd v0.2s, v20.2s, v21.2s + fsub v2.2s, v20.2s, v21.2s + fadd v1.2s, v22.2s, v23.2s + rev64 v26.2s, v26.2s + rev64 v27.2s, v27.2s + fsub v3.2s, v22.2s, v23.2s + fsub v6.2s, v6.2s, v7.2s + fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2 + fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6 + fadd v7.2s, v4.2s, v5.2s + fsub v18.2s, v2.2s, v6.2s + ld1 {v20.4s,v21.4s}, [x0], #32 + ld1 {v22.4s,v23.4s}, [x0], #32 + ext v26.8b, v24.8b, v25.8b, #4 + ext v27.8b, v25.8b, v24.8b, #4 + fadd v2.2s, v2.2s, v6.2s + fsub v16.2s, v0.2s, v7.2s + fadd v5.2s, v25.2s, v24.2s + fsub v4.2s, v26.2s, v27.2s + transpose v24.2d, v25.2d, v20.2d, v22.2d + transpose v26.2d, v27.2d, v21.2d, v23.2d + fadd v0.2s, v0.2s, v7.2s + fsub v17.2s, v1.2s, v5.2s + fsub v19.2s, v3.2s, v4.2s + fadd v3.2s, v3.2s, v4.2s + fadd v1.2s, v1.2s, v5.2s + ext v20.16b, v21.16b, v21.16b, #4 + ext v21.16b, v23.16b, v23.16b, #4 + + zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]} + zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]} + zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]} + zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]} + + // 2 x fft4 + transpose v22.2d, v23.2d, v20.2d, v21.2d + + fadd v4.4s, v24.4s, v25.4s + fadd v5.4s, v26.4s, v27.4s + fsub v6.4s, v24.4s, v25.4s + fsub v7.4s, v22.4s, v23.4s + + ld1 {v23.4s}, [x14] + + fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]} + fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]} + fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]} + fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]} + + //fft_pass_neon_16 + rev64 v7.4s, v25.4s + fmul v25.4s, v25.4s, v23.s[1] + fmul v7.4s, v7.4s, v29.4s + fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a} + + zip1 v20.4s, v24.4s, v25.4s + zip2 v21.4s, v24.4s, v25.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]} + fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]} + fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]} + fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]} + +//second half + rev64 v6.4s, v26.4s + fmul v26.4s, v26.4s, v23.s[2] + rev64 v7.4s, v27.4s + fmul v27.4s, v27.4s, v23.s[3] + fmul v6.4s, v6.4s, v29.4s + fmul v7.4s, v7.4s, v29.4s + fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6} + fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a} + + zip1 v24.4s, v26.4s, v27.4s + zip2 v25.4s, v26.4s, v27.4s + fneg v26.4s, v24.4s + fadd v4.4s, v25.4s, v24.4s + fsub v6.4s, v24.4s, v25.4s // just the second half + fadd v5.4s, v25.4s, v26.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]} + fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]} + fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]} + fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]} + + st1 {v16.4s,v17.4s}, [x1], #32 + st1 {v18.4s,v19.4s}, [x1], #32 + st1 {v20.4s,v21.4s}, [x1], #32 + st1 {v22.4s,v23.4s}, [x1], #32 + + ret +endfunc + + +const trans4_float, align=4 + .byte 0, 1, 2, 3 + .byte 8, 9, 10, 11 + .byte 4, 5, 6, 7 + .byte 12, 13, 14, 15 +endconst + +const trans8_float, align=4 + .byte 24, 25, 26, 27 + .byte 0, 1, 2, 3 + .byte 28, 29, 30, 31 + .byte 4, 5, 6, 7 +endconst + +function fft_pass_neon + sub x6, x2, #1 // n - 1, loop counter + lsl x5, x2, #3 // 2 * n * sizeof FFTSample + lsl x1, x2, #4 // 2 * n * sizeof FFTComplex + add x5, x4, x5 // wim + add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex + add x2, x0, x2, lsl #5 // &z[o2] + add x3, x0, x3 // &z[o3] + add x1, x0, x1 // &z[o1] + ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} + ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} + ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} + trn2 v25.2d, v20.2d, v22.2d + sub x5, x5, #4 // wim-- + trn1 v24.2d, v20.2d, v22.2d + ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1] + rev64 v7.4s, v25.4s + fmul v25.4s, v25.4s, v4.s[1] + ld1 {v16.4s}, [x0] // {z[0],z[1]} + fmul v7.4s, v7.4s, v29.4s + ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]} + prfm pldl1keep, [x2, #16] + prfm pldl1keep, [x3, #16] + fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} + prfm pldl1keep, [x0, #16] + prfm pldl1keep, [x1, #16] + + zip1 v20.4s, v24.4s, v25.4s + zip2 v21.4s, v24.4s, v25.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v20.4s, v16.4s, v4.4s + fsub v22.4s, v16.4s, v4.4s + fadd v21.4s, v17.4s, v5.4s + st1 {v20.4s}, [x0], #16 // {z[0], z[1]} + fsub v23.4s, v17.4s, v5.4s + + st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} + st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} + st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} +1: + ld1 {v20.4s},[x2] // {z[o2],z[o2+1]} + ld1 {v22.4s},[x3] // {z[o3],z[o3+1]} + ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]} + transpose v26.2d, v27.2d, v20.2d, v22.2d + ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]} + rev64 v6.4s, v26.4s + fmul v26.4s, v26.4s, v4.s[0] + rev64 v7.4s, v27.4s + fmul v27.4s, v27.4s, v4.s[1] + fmul v6.4s, v6.4s, v29.4s + fmul v7.4s, v7.4s, v29.4s + ld1 {v16.4s},[x0] // {z[0],z[1]} + fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6} + fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a} + ld1 {v17.4s},[x1] // {z[o1],z[o1+1]} + + subs x6, x6, #1 // n-- + + zip1 v20.4s, v26.4s, v27.4s + zip2 v21.4s, v26.4s, v27.4s + fneg v22.4s, v20.4s + fadd v4.4s, v21.4s, v20.4s + fsub v6.4s, v20.4s, v21.4s // just the second half + fadd v5.4s, v21.4s, v22.4s // just the first half + + tbl v4.16b, {v4.16b}, v30.16b // trans4_float + tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float + + fadd v20.4s, v16.4s, v4.4s + fsub v22.4s, v16.4s, v4.4s + fadd v21.4s, v17.4s, v5.4s + st1 {v20.4s}, [x0], #16 // {z[0], z[1]} + fsub v23.4s, v17.4s, v5.4s + + st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]} + st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]} + st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]} + b.ne 1b + + ret +endfunc + +.macro def_fft n, n2, n4 +function fft\n\()_neon align=6 + sub sp, sp, #16 + stp x28, x30, [sp] + add x28, x0, #\n4*2*8 + bl fft\n2\()_neon + mov x0, x28 + bl fft\n4\()_neon + add x0, x28, #\n4*1*8 + bl fft\n4\()_neon + sub x0, x28, #\n4*2*8 + ldp x28, x30, [sp], #16 + movrel x4, X(ff_cos_\n) + mov x2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + prfm pldl1keep, [x1] + movrel x10, trans4_float + ldr w2, [x0] + movrel x11, trans8_float + sub w2, w2, #2 + movrel x3, fft_tab_neon + ld1 {v30.16b}, [x10] + mov x7, #-8 + movrel x12, pmmp + ldr x3, [x3, x2, lsl #3] + movrel x13, mppm + movrel x14, X(ff_cos_16) + ld1 {v31.16b}, [x11] + mov x0, x1 + ld1 {v29.4s}, [x12] // pmmp + ld1 {v28.4s}, [x13] + br x3 +endfunc + +function ff_fft_permute_neon, export=1 + mov x6, #1 + ldr w2, [x0] // nbits + ldr x3, [x0, #16] // tmp_buf + ldr x0, [x0, #8] // revtab + lsl x6, x6, x2 + mov x2, x6 +1: + ld1 {v0.2s,v1.2s}, [x1], #16 + ldr w4, [x0], #4 + uxth w5, w4 + lsr w4, w4, #16 + add x5, x3, x5, lsl #3 + add x4, x3, x4, lsl #3 + st1 {v0.2s}, [x5] + st1 {v1.2s}, [x4] + subs x6, x6, #2 + b.gt 1b + + sub x1, x1, x2, lsl #3 +1: + ld1 {v0.4s,v1.4s}, [x3], #32 + st1 {v0.4s,v1.4s}, [x1], #32 + subs x2, x2, #4 + b.gt 1b + + ret +endfunc + +const fft_tab_neon + .quad fft4_neon + .quad fft8_neon + .quad fft16_neon + .quad fft32_neon + .quad fft64_neon + .quad fft128_neon + .quad fft256_neon + .quad fft512_neon + .quad fft1024_neon + .quad fft2048_neon + .quad fft4096_neon + .quad fft8192_neon + .quad fft16384_neon + .quad fft32768_neon + .quad fft65536_neon +endconst + +const pmmp, align=4 + .float +1.0, -1.0, -1.0, +1.0 +endconst + +const mppm, align=4 + .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +endconst -- cgit v1.2.3