author    Janne Grunau <janne-libav@jannau.net>  2014-03-26 15:20:42 +0100
committer Janne Grunau <janne-libav@jannau.net>  2014-04-22 19:35:40 +0200
commit    650c4300d94aa9398ff1dd4f454bf39eaa285f62 (patch)
tree      9f3063c9591be5412f48029d5a510ecc48ee804e /libavcodec/aarch64
parent    f9157463dbcd2db8fe9504197c0c04d0d7d04f31 (diff)
aarch64: NEON float FFT
Approximately as fast as the ARM NEON version on Apple's A7.
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--  libavcodec/aarch64/Makefile            |   2
-rw-r--r--  libavcodec/aarch64/fft_init_aarch64.c  |  37
-rw-r--r--  libavcodec/aarch64/fft_neon.S          | 442
3 files changed, 481 insertions, 0 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 757b499db0..01f618fc06 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,3 +1,4 @@
+OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
@@ -10,6 +11,7 @@ OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
 ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o
+NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
                                            aarch64/h264idct_neon.o
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
new file mode 100644
index 0000000000..caa5a0d90a
--- /dev/null
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+av_cold void ff_fft_init_aarch64(FFTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->fft_permute = ff_fft_permute_neon;
+ s->fft_calc = ff_fft_calc_neon;
+ }
+}
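Note: the diffstat above is limited to 'libavcodec/aarch64', so the generic code that calls ff_fft_init_aarch64() is not visible in this diff. A minimal sketch of the usual libav wiring, assuming the standard ff_fft_init()/ARCH_AARCH64 pattern in libavcodec/fft.c; the helper names fft_permute_c and fft_calc_c are illustrative, not taken from this patch:

    /* Sketch of how the generic FFT init is expected to hand off to the
     * aarch64 init above; the real hook lives outside this filtered diff. */
    av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
    {
        /* ... generic setup: revtab, tmp_buf, C implementations ... */
        s->fft_permute = fft_permute_c;   /* C defaults (names illustrative) */
        s->fft_calc    = fft_calc_c;

        if (ARCH_AARCH64)
            ff_fft_init_aarch64(s);       /* may override with the NEON versions */

        return 0;
    }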
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
new file mode 100644
index 0000000000..5189bfb4ef
--- /dev/null
+++ b/libavcodec/aarch64/fft_neon.S
@@ -0,0 +1,442 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+.macro transpose d0, d1, s0, s1
+ trn1 \d0, \s0, \s1
+ trn2 \d1, \s0, \s1
+.endm
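For .2d operands, trn1 gathers the low 64-bit lanes of both sources and trn2 the high lanes, so this macro transposes a 2x2 matrix of 64-bit elements (here, FFTComplex values). A scalar model of the .2d case, as an illustration only:

    #include <stdint.h>

    /* Model of "transpose d0, d1, s0, s1" with .2d arrangement: each
     * uint64_t stands for one 64-bit lane (one float complex value). */
    typedef struct { uint64_t lane[2]; } Vec2d;

    static void transpose_2d(Vec2d *d0, Vec2d *d1, Vec2d s0, Vec2d s1)
    {
        d0->lane[0] = s0.lane[0];  d0->lane[1] = s1.lane[0];  /* trn1 */
        d1->lane[0] = s0.lane[1];  d1->lane[1] = s1.lane[1];  /* trn2 */
    }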
+
+
+function fft4_neon
+ ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+ fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
+
+ ext v16.8b, v2.8b, v3.8b, #4
+ ext v17.8b, v3.8b, v2.8b, #4
+
+ fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3
+ fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3
+
+ fadd v0.2s, v4.2s, v5.2s
+ fsub v2.2s, v4.2s, v5.2s
+ fadd v1.2s, v6.2s, v7.2s
+ fsub v3.2s, v6.2s, v7.2s
+
+ st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+ ret
+endfunc
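A scalar reference for what fft4_neon computes, reconstructed from the register comments above (v4 = z0+z1, v6 = z0-z1, v5 = z2+z3, v7 = -i*(z2-z3)); a sketch for the reader, not code from the patch:

    typedef struct { float re, im; } FFTComplex;

    /* 4-point DIT butterfly as performed by fft4_neon (scalar sketch). */
    static void fft4_ref(FFTComplex *z)
    {
        float ar = z[0].re + z[1].re, ai = z[0].im + z[1].im;  /* v4 */
        float br = z[0].re - z[1].re, bi = z[0].im - z[1].im;  /* v6 */
        float cr = z[2].re + z[3].re, ci = z[2].im + z[3].im;  /* v5 */
        float dr = z[2].im - z[3].im, di = z[3].re - z[2].re;  /* v7 = -i*(z2-z3) */

        z[0].re = ar + cr;  z[0].im = ai + ci;
        z[2].re = ar - cr;  z[2].im = ai - ci;
        z[1].re = br + dr;  z[1].im = bi + di;
        z[3].re = br - dr;  z[3].im = bi - di;
    }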
+
+function fft8_neon
+ mov x1, x0
+ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
+ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+ ext v22.8b, v2.8b, v3.8b, #4
+ ext v23.8b, v3.8b, v2.8b, #4
+ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
+ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
+ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
+ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
+ rev64 v27.2s, v28.2s // v27 = {w,-w}; v28 = {-w,w} (mppm), w = sqrt(0.5)
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
+ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
+ ext v6.8b, v4.8b, v5.8b, #4
+ ext v7.8b, v5.8b, v4.8b, #4
+ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
+ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
+ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
+ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
+ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
+ fadd v0.2s, v20.2s, v21.2s
+ fsub v2.2s, v20.2s, v21.2s
+ fadd v1.2s, v22.2s, v23.2s
+ rev64 v26.2s, v26.2s
+ rev64 v27.2s, v27.2s
+ fsub v3.2s, v22.2s, v23.2s
+ fsub v6.2s, v6.2s, v7.2s
+ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
+ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
+ fadd v7.2s, v4.2s, v5.2s
+ fsub v18.2s, v2.2s, v6.2s
+ ext v26.8b, v24.8b, v25.8b, #4
+ ext v27.8b, v25.8b, v24.8b, #4
+ fadd v2.2s, v2.2s, v6.2s
+ fsub v16.2s, v0.2s, v7.2s
+ fadd v5.2s, v25.2s, v24.2s
+ fsub v4.2s, v26.2s, v27.2s
+ fadd v0.2s, v0.2s, v7.2s
+ fsub v17.2s, v1.2s, v5.2s
+ fsub v19.2s, v3.2s, v4.2s
+ fadd v3.2s, v3.2s, v4.2s
+ fadd v1.2s, v1.2s, v5.2s
+
+ st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]
+
+ ret
+endfunc
+
+function fft16_neon
+ mov x1, x0
+ ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
+ ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
+ ext v22.8b, v2.8b, v3.8b, #4
+ ext v23.8b, v3.8b, v2.8b, #4
+ fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
+ fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
+ fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
+ fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
+ rev64 v27.2s, v28.2s // v27 = {w,-w}; v28 = {-w,w} (mppm), w = sqrt(0.5)
+ fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
+ fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
+ fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
+ ext v6.8b, v4.8b, v5.8b, #4
+ ext v7.8b, v5.8b, v4.8b, #4
+ fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
+ fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
+ fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
+ fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
+ fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
+ fadd v0.2s, v20.2s, v21.2s
+ fsub v2.2s, v20.2s, v21.2s
+ fadd v1.2s, v22.2s, v23.2s
+ rev64 v26.2s, v26.2s
+ rev64 v27.2s, v27.2s
+ fsub v3.2s, v22.2s, v23.2s
+ fsub v6.2s, v6.2s, v7.2s
+ fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
+ fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
+ fadd v7.2s, v4.2s, v5.2s
+ fsub v18.2s, v2.2s, v6.2s
+ ld1 {v20.4s,v21.4s}, [x0], #32
+ ld1 {v22.4s,v23.4s}, [x0], #32
+ ext v26.8b, v24.8b, v25.8b, #4
+ ext v27.8b, v25.8b, v24.8b, #4
+ fadd v2.2s, v2.2s, v6.2s
+ fsub v16.2s, v0.2s, v7.2s
+ fadd v5.2s, v25.2s, v24.2s
+ fsub v4.2s, v26.2s, v27.2s
+ transpose v24.2d, v25.2d, v20.2d, v22.2d
+ transpose v26.2d, v27.2d, v21.2d, v23.2d
+ fadd v0.2s, v0.2s, v7.2s
+ fsub v17.2s, v1.2s, v5.2s
+ fsub v19.2s, v3.2s, v4.2s
+ fadd v3.2s, v3.2s, v4.2s
+ fadd v1.2s, v1.2s, v5.2s
+ ext v20.16b, v21.16b, v21.16b, #4
+ ext v21.16b, v23.16b, v23.16b, #4
+
+ zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
+ zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
+ zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
+ zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
+
+ // 2 x fft4
+ transpose v22.2d, v23.2d, v20.2d, v21.2d
+
+ fadd v4.4s, v24.4s, v25.4s
+ fadd v5.4s, v26.4s, v27.4s
+ fsub v6.4s, v24.4s, v25.4s
+ fsub v7.4s, v22.4s, v23.4s
+
+ ld1 {v23.4s}, [x14]
+
+ fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
+ fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
+ fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
+ fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
+
+ // fft_pass_neon_16
+ rev64 v7.4s, v25.4s
+ fmul v25.4s, v25.4s, v23.s[1]
+ fmul v7.4s, v7.4s, v29.4s
+ fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
+
+ zip1 v20.4s, v24.4s, v25.4s
+ zip2 v21.4s, v24.4s, v25.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
+ fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
+ fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
+ fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
+
+// second half
+ rev64 v6.4s, v26.4s
+ fmul v26.4s, v26.4s, v23.s[2]
+ rev64 v7.4s, v27.4s
+ fmul v27.4s, v27.4s, v23.s[3]
+ fmul v6.4s, v6.4s, v29.4s
+ fmul v7.4s, v7.4s, v29.4s
+ fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
+ fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
+
+ zip1 v24.4s, v26.4s, v27.4s
+ zip2 v25.4s, v26.4s, v27.4s
+ fneg v26.4s, v24.4s
+ fadd v4.4s, v25.4s, v24.4s
+ fsub v6.4s, v24.4s, v25.4s // just the second half
+ fadd v5.4s, v25.4s, v26.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
+ fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
+ fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
+ fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
+
+ st1 {v16.4s,v17.4s}, [x1], #32
+ st1 {v18.4s,v19.4s}, [x1], #32
+ st1 {v20.4s,v21.4s}, [x1], #32
+ st1 {v22.4s,v23.4s}, [x1], #32
+
+ ret
+endfunc
+
+
+const trans4_float, align=4
+ .byte 0, 1, 2, 3
+ .byte 8, 9, 10, 11
+ .byte 4, 5, 6, 7
+ .byte 12, 13, 14, 15
+endconst
+
+const trans8_float, align=4
+ .byte 24, 25, 26, 27
+ .byte 0, 1, 2, 3
+ .byte 28, 29, 30, 31
+ .byte 4, 5, 6, 7
+endconst
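Viewed as float lanes, trans4_float selects lanes {0, 2, 1, 3} from one register, and trans8_float selects lanes {6, 0, 7, 1} from the 32-byte concatenation of two registers; the tbl instructions above use them to interleave the butterfly halves. A scalar model of both permutes (illustrative):

    /* Float-lane view of the two tbl permutes above (scalar sketch). */
    static void trans4(float d[4], const float a[4])
    {
        /* bytes {0-3, 8-11, 4-7, 12-15} = float lanes {0, 2, 1, 3} */
        d[0] = a[0]; d[1] = a[2]; d[2] = a[1]; d[3] = a[3];
    }

    static void trans8(float d[4], const float lo[4], const float hi[4])
    {
        /* bytes {24-27, 0-3, 28-31, 4-7} index the 8-float pair {lo,hi},
         * i.e. float lanes {6, 0, 7, 1} */
        d[0] = hi[2]; d[1] = lo[0]; d[2] = hi[3]; d[3] = lo[1];
    }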
+
+function fft_pass_neon
+ sub x6, x2, #1 // n - 1, loop counter
+ lsl x5, x2, #3 // 2 * n * sizeof FFTSample
+ lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
+ add x5, x4, x5 // wim
+ add x3, x1, x2, lsl #5 // 6 * n * sizeof FFTComplex
+ add x2, x0, x2, lsl #5 // &z[o2]
+ add x3, x0, x3 // &z[o3]
+ add x1, x0, x1 // &z[o1]
+ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
+ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
+ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
+ trn2 v25.2d, v20.2d, v22.2d
+ sub x5, x5, #4 // wim--
+ trn1 v24.2d, v20.2d, v22.2d
+ ld1 {v5.s}[0], [x5], x7 // v5.s[0] = wim[-1]
+ rev64 v7.4s, v25.4s
+ fmul v25.4s, v25.4s, v4.s[1]
+ ld1 {v16.4s}, [x0] // {z[0],z[1]}
+ fmul v7.4s, v7.4s, v29.4s
+ ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
+ prfm pldl1keep, [x2, #16]
+ prfm pldl1keep, [x3, #16]
+ fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
+ prfm pldl1keep, [x0, #16]
+ prfm pldl1keep, [x1, #16]
+
+ zip1 v20.4s, v24.4s, v25.4s
+ zip2 v21.4s, v24.4s, v25.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v20.4s, v16.4s, v4.4s
+ fsub v22.4s, v16.4s, v4.4s
+ fadd v21.4s, v17.4s, v5.4s
+ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
+ fsub v23.4s, v17.4s, v5.4s
+
+ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
+ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
+ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
+1:
+ ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
+ ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
+ ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
+ transpose v26.2d, v27.2d, v20.2d, v22.2d
+ ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
+ rev64 v6.4s, v26.4s
+ fmul v26.4s, v26.4s, v4.s[0]
+ rev64 v7.4s, v27.4s
+ fmul v27.4s, v27.4s, v4.s[1]
+ fmul v6.4s, v6.4s, v29.4s
+ fmul v7.4s, v7.4s, v29.4s
+ ld1 {v16.4s},[x0] // {z[0],z[1]}
+ fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
+ fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
+ ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
+
+ subs x6, x6, #1 // n--
+
+ zip1 v20.4s, v26.4s, v27.4s
+ zip2 v21.4s, v26.4s, v27.4s
+ fneg v22.4s, v20.4s
+ fadd v4.4s, v21.4s, v20.4s
+ fsub v6.4s, v20.4s, v21.4s // just the second half
+ fadd v5.4s, v21.4s, v22.4s // just the first half
+
+ tbl v4.16b, {v4.16b}, v30.16b // trans4_float
+ tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+ fadd v20.4s, v16.4s, v4.4s
+ fsub v22.4s, v16.4s, v4.4s
+ fadd v21.4s, v17.4s, v5.4s
+ st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
+ fsub v23.4s, v17.4s, v5.4s
+
+ st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
+ st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
+ st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
+ b.ne 1b
+
+ ret
+endfunc
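Each loop iteration applies the split-radix butterfly named by the t1..t6 comments to two complex pairs at once. A scalar sketch of a single butterfly with the same naming, in the style of the generic pass in libavcodec's fft_template.c (the generic code is outside this diff, so treat this as illustrative):

    typedef struct { float re, im; } FFTComplex;

    /* One split-radix butterfly of fft_pass_neon (scalar sketch). */
    static void transform(FFTComplex *a0, FFTComplex *a1,
                          FFTComplex *a2, FFTComplex *a3,
                          float wre, float wim)
    {
        float t1 = a2->re * wre + a2->im * wim;   /* twiddle z[o2] */
        float t2 = a2->im * wre - a2->re * wim;
        float t5 = a3->re * wre - a3->im * wim;   /* twiddle z[o3] */
        float t6 = a3->im * wre + a3->re * wim;

        float t3 = t5 - t1, t7 = t5 + t1;
        float t4 = t2 - t6, t8 = t2 + t6;

        a2->re = a0->re - t7;  a0->re += t7;      /* z[0]  <-> z[o2] */
        a2->im = a0->im - t8;  a0->im += t8;
        a3->re = a1->re - t4;  a1->re += t4;      /* z[o1] <-> z[o3] */
        a3->im = a1->im - t3;  a1->im += t3;
    }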
+
+.macro def_fft n, n2, n4
+function fft\n\()_neon, align=6
+ sub sp, sp, #16
+ stp x28, x30, [sp]
+ add x28, x0, #\n4*2*8
+ bl fft\n2\()_neon
+ mov x0, x28
+ bl fft\n4\()_neon
+ add x0, x28, #\n4*1*8
+ bl fft\n4\()_neon
+ sub x0, x28, #\n4*2*8
+ ldp x28, x30, [sp], #16
+ movrel x4, X(ff_cos_\n)
+ mov x2, #\n4/2
+ b fft_pass_neon
+endfunc
+.endm
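Each def_fft expansion is the classic split-radix recursion: an N/2 transform on the first half, two N/4 transforms on the third and fourth quarters, then one combining pass over all N points with the ff_cos_N table (the pass's n argument is N/8, matching mov x2, #\n4/2). Scalar shape, with placeholder names standing in for the generated asm routines (illustrative):

    typedef struct { float re, im; } FFTComplex;

    /* Placeholders for the asm routines (names illustrative). */
    void fft_half(FFTComplex *z);                               /* fft(N/2)_neon */
    void fft_quarter(FFTComplex *z);                            /* fft(N/4)_neon */
    void fft_pass(FFTComplex *z, const float *wre, unsigned n); /* fft_pass_neon */

    /* Scalar shape of one def_fft expansion for size N (sketch). */
    static void fft_N(FFTComplex *z, const float *cos_N, int N)
    {
        fft_half(z);                 /* bl fft\n2\()_neon               */
        fft_quarter(z + N / 2);      /* bl fft\n4\()_neon at x28        */
        fft_quarter(z + 3 * N / 4);  /* bl fft\n4\()_neon at x28 + N/4  */
        fft_pass(z, cos_N, N / 8);   /* b fft_pass_neon, x2 = N/8       */
    }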
+
+ def_fft 32, 16, 8
+ def_fft 64, 32, 16
+ def_fft 128, 64, 32
+ def_fft 256, 128, 64
+ def_fft 512, 256, 128
+ def_fft 1024, 512, 256
+ def_fft 2048, 1024, 512
+ def_fft 4096, 2048, 1024
+ def_fft 8192, 4096, 2048
+ def_fft 16384, 8192, 4096
+ def_fft 32768, 16384, 8192
+ def_fft 65536, 32768, 16384
+
+function ff_fft_calc_neon, export=1
+ prfm pldl1keep, [x1]
+ movrel x10, trans4_float
+ ldr w2, [x0] // nbits
+ movrel x11, trans8_float
+ sub w2, w2, #2
+ movrel x3, fft_tab_neon
+ ld1 {v30.16b}, [x10]
+ mov x7, #-8
+ movrel x12, pmmp
+ ldr x3, [x3, x2, lsl #3]
+ movrel x13, mppm
+ movrel x14, X(ff_cos_16)
+ ld1 {v31.16b}, [x11]
+ mov x0, x1
+ ld1 {v29.4s}, [x12] // pmmp
+ ld1 {v28.4s}, [x13] // mppm
+ br x3
+endfunc
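ff_fft_calc_neon preloads the shared permute tables and sign masks into v28-v31, then tail-calls through fft_tab_neon indexed by nbits-2 (fft4_neon is entry 0). The dispatch in C terms (illustrative; fft_tab here mirrors fft_tab_neon below):

    typedef struct { float re, im; } FFTComplex;
    typedef void (*fft_fn)(FFTComplex *z);

    extern const fft_fn fft_tab[15];   /* fft4 ... fft65536 */

    /* nbits is the first field of FFTContext (the "ldr w2, [x0]" above). */
    static void fft_calc_ref(int nbits, FFTComplex *z)
    {
        fft_tab[nbits - 2](z);         /* ldr x3, [x3, x2, lsl #3]; br x3 */
    }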
+
+function ff_fft_permute_neon, export=1
+ mov x6, #1
+ ldr w2, [x0] // nbits
+ ldr x3, [x0, #16] // tmp_buf
+ ldr x0, [x0, #8] // revtab
+ lsl x6, x6, x2
+ mov x2, x6
+1:
+ ld1 {v0.2s,v1.2s}, [x1], #16
+ ldr w4, [x0], #4
+ uxth w5, w4
+ lsr w4, w4, #16
+ add x5, x3, x5, lsl #3
+ add x4, x3, x4, lsl #3
+ st1 {v0.2s}, [x5]
+ st1 {v1.2s}, [x4]
+ subs x6, x6, #2
+ b.gt 1b
+
+ sub x1, x1, x2, lsl #3
+1:
+ ld1 {v0.4s,v1.4s}, [x3], #32
+ st1 {v0.4s,v1.4s}, [x1], #32
+ subs x2, x2, #4
+ b.gt 1b
+
+ ret
+endfunc
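The permute consumes revtab two 16-bit entries per 32-bit load (low and high halfword), scatters each pair of complex values into tmp_buf, then copies the reordered buffer back over z. A scalar sketch of the same two phases (illustrative):

    #include <stdint.h>
    #include <string.h>

    typedef struct { float re, im; } FFTComplex;

    /* Scalar sketch of ff_fft_permute_neon: scatter through revtab into
     * tmp_buf, then copy back in order. */
    static void fft_permute_ref(const uint16_t *revtab, FFTComplex *tmp_buf,
                                FFTComplex *z, int nbits)
    {
        int n = 1 << nbits;
        for (int i = 0; i < n; i++)
            tmp_buf[revtab[i]] = z[i];       /* st1 {v0.2s}/{v1.2s} scatter */
        memcpy(z, tmp_buf, n * sizeof(*z));  /* second loop: copy back     */
    }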
+
+const fft_tab_neon
+ .quad fft4_neon
+ .quad fft8_neon
+ .quad fft16_neon
+ .quad fft32_neon
+ .quad fft64_neon
+ .quad fft128_neon
+ .quad fft256_neon
+ .quad fft512_neon
+ .quad fft1024_neon
+ .quad fft2048_neon
+ .quad fft4096_neon
+ .quad fft8192_neon
+ .quad fft16384_neon
+ .quad fft32768_neon
+ .quad fft65536_neon
+endconst
+
+const pmmp, align=4
+ .float +1.0, -1.0, -1.0, +1.0
+endconst
+
+const mppm, align=4
+ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+endconst