From 3d5d46233cd81f78138a6d7418d480af04d3f6c8 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Sun, 1 Feb 2015 11:08:17 +0100
Subject: opus: Factor out imdct15 into a standalone component

It will be reused by the AAC decoder.
---
 libavcodec/aarch64/Makefile          |   4 +-
 libavcodec/aarch64/imdct15_init.c    |  46 +++
 libavcodec/aarch64/imdct15_neon.S    | 647 +++++++++++++++++++++++++++++++++++
 libavcodec/aarch64/opus_imdct_init.c |  45 ---
 libavcodec/aarch64/opus_imdct_neon.S | 647 -----------------------------------
 5 files changed, 695 insertions(+), 694 deletions(-)
 create mode 100644 libavcodec/aarch64/imdct15_init.c
 create mode 100644 libavcodec/aarch64/imdct15_neon.S
 delete mode 100644 libavcodec/aarch64/opus_imdct_init.c
 delete mode 100644 libavcodec/aarch64/opus_imdct_neon.S

(limited to 'libavcodec/aarch64')

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 964428e35d..2afff297dc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -3,11 +3,11 @@ OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
+OBJS-$(CONFIG_IMDCT15)                  += aarch64/imdct15_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
 
-OBJS-$(CONFIG_OPUS_DECODER)             += aarch64/opus_imdct_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
@@ -21,8 +21,8 @@ NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
 NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
                                            aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
+NEON-OBJS-$(CONFIG_IMDCT15)             += aarch64/imdct15_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 
-NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opus_imdct_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
new file mode 100644
index 0000000000..38018f2b4a
--- /dev/null
+++ b/libavcodec/aarch64/imdct15_init.c
@@ -0,0 +1,46 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/internal.h"
+
+#include "libavcodec/imdct15.h"
+
+#include "asm-offsets.h"
+
+AV_CHECK_OFFSET(IMDCT15Context, exptab,         CELT_EXPTAB);  /* CELT_* offsets are used by imdct15_neon.S; presumably a build-time layout check - confirm macro */
+AV_CHECK_OFFSET(IMDCT15Context, fft_n,          CELT_FFT_N);
+AV_CHECK_OFFSET(IMDCT15Context, len2,           CELT_LEN2);
+AV_CHECK_OFFSET(IMDCT15Context, len4,           CELT_LEN4);    /* asm loads len2/len4 with one ldp: len4 must directly follow len2 */
+AV_CHECK_OFFSET(IMDCT15Context, tmp,            CELT_TMP);
+AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE); /* asm loads tmp/twiddle_exptab with one ldp: must directly follow tmp */
+
+void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src,
+                             ptrdiff_t stride, float scale); /* implemented in imdct15_neon.S */
+
+/* Install the NEON half-IMDCT when the running CPU reports NEON support. */
+void ff_imdct15_init_aarch64(IMDCT15Context *s)
+{
+    const int flags = av_get_cpu_flags();
+    if (!have_neon(flags))
+        return; /* keep whatever implementation the caller already set */
+    s->imdct_half = ff_celt_imdct_half_neon;
+}
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
new file mode 100644
index 0000000000..d99edf4108
--- /dev/null
+++ b/libavcodec/aarch64/imdct15_neon.S
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#include "asm-offsets.h"
+
+.macro shuffle a, b, c, d               // emit const shuffle_abcd: 16 tbl byte indices selecting 32-bit lanes a, b, c, d
+const shuffle_\a\b\c\d, align=4
+        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)       // result lane 0 <- source lane a
+        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)       // result lane 1 <- source lane b
+        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)       // result lane 2 <- source lane c
+        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)       // result lane 3 <- source lane d
+endconst
+.endm
+
+shuffle 0, 2, 1, 3                      // shuffle_0213, addressed through x11 in fft5_neon
+shuffle 1, 0, 3, 2                      // shuffle_1032, addressed through x12 in fft5_neon
+shuffle 2, 3, 0, 1                      // shuffle_2301, addressed through x13 in fft5_neon
+shuffle 3, 1, 2, 0                      // shuffle_3120, addressed through x14 in fft5_neon
+
+
+function fft5_neon                      // 5-point FFT step; in: x1 = input, x2 = stride in complex elements; needs v15 = fact5, x11-x14 = shuffle tables
+        lsl             x2,  x2,  #3            // stride in bytes (8 bytes per complex float)
+        ld1             {v24.2s},         [x1],  x2     // in[0] (re, im)
+        ld2             {v25.s,v26.s}[0], [x1],  x2     // in[1..4]: re -> v25 lanes, im -> v26 lanes
+        ld2             {v25.s,v26.s}[1], [x1],  x2
+        ld2             {v25.s,v26.s}[2], [x1],  x2
+        ld2             {v25.s,v26.s}[3], [x1]
+        dup             v6.4s,  v24.s[0]        // in[0].re in every lane
+        dup             v7.4s,  v24.s[1]        // in[0].im in every lane
+
+        faddp           v0.4s,  v25.4s, v26.4s  // pairwise partial sums of re and of im
+        // z[][0], z[][3]
+        fmul            v16.4s, v25.4s, v15.s[0] // rr
+        fmul            v17.4s, v25.4s, v15.s[1] // ri
+        fmul            v18.4s, v26.4s, v15.s[0] // ir
+        fmul            v19.4s, v26.4s, v15.s[1] // ii
+        faddp           v0.4s,  v0.4s,  v0.4s   // v0.2s = sum of in[1..4] (re, im)
+        // z[][1], z[][2]
+        fmul            v20.4s, v25.4s, v15.s[2] // rr
+        fmul            v21.4s, v25.4s, v15.s[3] // ri
+        fmul            v22.4s, v26.4s, v15.s[2] // ir
+        fmul            v23.4s, v26.4s, v15.s[3] // ii
+        fadd            v0.2s,  v24.2s, v0.2s   // out[0]
+
+        // z[0123][0], z[0123][3]
+        fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
+        fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
+        ld1             {v16.16b},  [x11]       // tbl indices: shuffle_0213
+        ld1             {v19.16b},  [x14]       // tbl indices: shuffle_3120
+        fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
+        fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
+        ld1             {v17.16b},  [x12]       // tbl indices: shuffle_1032
+        // z[0123][1], z[0123][2]
+        fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
+        fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
+        ld1             {v18.16b},  [x13]       // tbl indices: shuffle_2301
+        fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
+        fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
+
+        //real
+        tbl             v20.16b, {v24.16b}, v16.16b     // permute each product so lane k holds the term for output k+1
+        tbl             v21.16b, {v25.16b}, v17.16b
+        tbl             v22.16b, {v26.16b}, v18.16b
+        tbl             v23.16b, {v27.16b}, v19.16b
+        //imag
+        tbl             v16.16b, {v28.16b}, v16.16b     // index registers are overwritten by their own results
+        tbl             v17.16b, {v29.16b}, v17.16b
+        tbl             v18.16b, {v30.16b}, v18.16b
+        tbl             v19.16b, {v31.16b}, v19.16b
+
+        fadd            v6.4s,  v6.4s,  v20.4s  // sum the four permuted terms onto in[0] (re in v6, im in v7)
+        fadd            v22.4s, v22.4s, v23.4s
+        fadd            v7.4s,  v7.4s,  v16.4s
+        fadd            v18.4s, v18.4s, v19.4s
+
+        fadd            v21.4s, v21.4s, v22.4s
+        fadd            v17.4s, v17.4s, v18.4s
+        fadd            v6.4s,  v6.4s,  v21.4s  // v6 = real parts of the four remaining results
+        fadd            v7.4s,  v7.4s,  v17.4s  // v7 = imaginary parts of the four remaining results
+
+        ret                                     // results: v0.2s, v6, v7; x1, x2 and v16-v31 are overwritten
+endfunc
+
+function fft15_neon                     // 15-point FFT from three fft5_neon calls; x0 = out, x1 = in, x3 = stride; needs v8-v14 = exptab values
+        mov             x8,  x1                 // x8 = in (fft5_neon advances x1)
+        mov             x9,  x30                // keep the return address, bl overwrites x30
+        add             x2,  x3,  x3,  lsl #1   // 3 * stride
+
+        add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
+        bl              fft5_neon
+        mov             v1.8b,   v0.8b          // save first sub-FFT: out[0] in v1, re/im in v2/v3
+        mov             v2.16b,  v6.16b
+        mov             v3.16b,  v7.16b
+
+        add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
+        add             x2,  x3,  x3,  lsl #1   // 3 * stride
+        bl              fft5_neon
+        zip1            v1.4s,   v1.4s,  v0.4s  // v1 = { a.re, b.re, a.im, b.im } from the two saved out[0] values
+        mov             v4.16b,  v6.16b         // save second sub-FFT re/im in v4/v5
+        mov             v5.16b,  v7.16b
+
+        mov             x1,  x8                 // in + 0 * stride
+        add             x2,  x3,  x3,  lsl #1   // 3 * stride
+        bl              fft5_neon               // third sub-FFT stays in v0, v6, v7
+
+        faddp           v20.4s, v1.4s,  v1.4s   // a + b (re, im)
+
+        ext             v18.16b, v8.16b,  v8.16b,  #4   // rotate exp[1-4] by one lane
+        ext             v19.16b, v9.16b,  v9.16b,  #4
+        mov             v16.16b, v6.16b         // accumulators start from the third sub-FFT
+        mov             v17.16b, v7.16b
+        fadd            v20.2s, v20.2s, v0.2s   // plus out[0] of the third sub-FFT
+
+        uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
+        uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
+
+        st1             {v20.2s},  [x0], #8     // out[0]
+
+        fmla            v16.4s, v2.4s,  v8.4s   // re += first sub-FFT * exp[1-4] (complex multiply)
+        fmls            v16.4s, v3.4s,  v9.4s
+
+        fmla            v17.4s, v2.4s,  v9.4s   // im += first sub-FFT * exp[1-4]
+        fmla            v17.4s, v3.4s,  v8.4s
+
+        fmla            v16.4s, v4.4s,  v18.4s  // re += second sub-FFT * exp[2,4,6,8]
+        fmls            v16.4s, v5.4s,  v19.4s
+
+        fmla            v17.4s, v4.4s,  v19.4s  // im += second sub-FFT * exp[2,4,6,8]
+        fmla            v17.4s, v5.4s,  v18.4s
+
+        zip1            v18.4s, v16.4s, v17.4s  // interleave re/im for storing
+        zip2            v19.4s, v16.4s, v17.4s
+
+        rev64           v31.4s, v14.4s          // v14 holds exp[5] and exp[10], see fft_b15_calc_neon
+        trn1            v28.2d, v1.2d,  v1.2d   // { a.re, b.re } in both halves
+        trn2            v29.2d, v1.2d,  v1.2d   // { a.im, b.im } in both halves
+        zip1            v30.2d, v14.2d, v31.2d
+        zip2            v31.2d, v14.2d, v31.2d
+
+        st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
+
+        fmul            v16.4s, v28.4s, v30.4s  // complex products of a, b with exp[5], exp[10]
+        fmul            v17.4s, v29.4s, v30.4s
+        fmls            v16.4s, v29.4s, v31.4s
+        fmla            v17.4s, v28.4s, v31.4s
+        faddp           v16.4s, v16.4s, v16.4s  // add the a and b contributions
+        faddp           v17.4s, v17.4s, v17.4s
+        zip1            v18.2s, v16.2s, v17.2s
+        zip2            v19.2s, v16.2s, v17.2s
+
+        fadd            v18.2s, v18.2s, v0.2s   // value stored below as out[5]
+        fadd            v0.2s,  v19.2s, v0.2s   // value stored below as out[10]
+
+        ext             v30.16b, v12.16b, v12.16b, #4   // rotate exp[11-14] by one lane
+        ext             v31.16b, v13.16b, v13.16b, #4
+        mov             v16.16b, v6.16b         // restart accumulators from the third sub-FFT
+        mov             v17.16b, v7.16b
+
+        uzp1            v30.4s, v30.4s, v8.4s   // exp[12,14,1,3].re
+        uzp1            v31.4s, v31.4s, v9.4s   // exp[12,14,1,3].im
+
+        st1             {v18.2s},  [x0], #8     // out[5]
+
+        fmla            v16.4s, v2.4s,  v10.4s  // first sub-FFT * exp[6-9]
+        fmls            v16.4s, v3.4s,  v11.4s
+
+        fmla            v17.4s, v2.4s,  v11.4s
+        fmla            v17.4s, v3.4s,  v10.4s
+
+        fmla            v16.4s, v4.4s,  v30.4s  // second sub-FFT * exp[12,14,1,3]
+        fmls            v16.4s, v5.4s,  v31.4s
+
+        fmla            v17.4s, v4.4s,  v31.4s
+        fmla            v17.4s, v5.4s,  v30.4s
+
+        zip1            v18.4s, v16.4s, v17.4s
+        zip2            v19.4s, v16.4s, v17.4s
+
+        ext             v30.16b, v10.16b, v10.16b, #4   // rotate exp[6-9] by one lane
+        ext             v31.16b, v11.16b, v11.16b, #4
+
+        fmla            v6.4s,  v2.4s,  v12.4s  // last group accumulates in place: first sub-FFT * exp[11-14]
+        fmls            v6.4s,  v3.4s,  v13.4s
+
+        st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
+
+        uzp1            v30.4s, v30.4s, v12.4s  // exp[7,9,11,13].re
+        uzp1            v31.4s, v31.4s, v13.4s  // exp[7,9,11,13].im
+
+        fmla            v7.4s,  v2.4s,  v13.4s
+        fmla            v7.4s,  v3.4s,  v12.4s
+
+        st1             {v0.2s},  [x0], #8     // out[10]
+
+        fmla            v6.4s,  v4.4s,  v30.4s  // second sub-FFT * exp[7,9,11,13]
+        fmls            v6.4s,  v5.4s,  v31.4s
+
+        fmla            v7.4s,  v4.4s,  v31.4s
+        fmla            v7.4s,  v5.4s,  v30.4s
+
+        zip1            v18.4s, v6.4s,  v7.4s
+        zip2            v19.4s, v6.4s,  v7.4s
+
+        st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
+
+        ret             x9                      // return through the saved link register
+endfunc
+
+// Butterfly pass: out[k], out2[k] = out[k] +/- out2[k] * exptab[k].  x0: out, x1: out+len2, x2: exptab, x3: len2
+function fft15_pass
+        ands            x6,  x3,  #3            // x6 = elements left over after groups of four
+        mov             x4,  x0                 // x4/x5 = store pointers, x0/x1 = load pointers
+        mov             x5,  x1
+        b.eq            9f                      // no leftover: vector path only
+        ld1             {v0.2s},  [x0], #8
+        ld1             {v1.2s},  [x1], #8
+        sub             x3,  x3,  x6            // x3 = count for the vector path
+        subs            x6,  x6,  #1
+        fadd            v2.2s,  v0.2s,  v1.2s   // first element: plain add/sub, no multiply
+        fsub            v3.2s,  v0.2s,  v1.2s
+        add             x2,  x2,  #8            // skip exptab[0] (presumably 1.0 - confirm table contents)
+        st1             {v2.2s},  [x4], #8
+        st1             {v3.2s},  [x5], #8
+        b.eq            9f
+1:
+        subs            x6,  x6,  #1            // scalar loop over the remaining leftover elements
+        ldp             s4,  s5,  [x2], #8      // exptab entry (re, im)
+        ldp             s2,  s3,  [x1], #8      // upper-half element
+        ldp             s0,  s1,  [x0], #8      // lower-half element
+
+        fmul            s6,  s2,  s4            // s6/s7 = upper-half element * exptab entry (complex)
+        fmul            s7,  s2,  s5
+        fmls            s6,  s3,  v5.s[0]
+        fmla            s7,  s3,  v4.s[0]
+
+        fsub            s2,  s0,  s6
+        fsub            s3,  s1,  s7
+        fadd            s0,  s0,  s6
+        fadd            s1,  s1,  s7
+
+        stp             s2,  s3,  [x5], #8
+        stp             s0,  s1,  [x4], #8
+        b.gt            1b
+9:
+        ld1             {v4.4s,v5.4s}, [x2],  #32       // preload first group of four: exptab, upper half, lower half
+        ld2             {v2.4s,v3.4s}, [x1],  #32
+        uzp1            v6.4s,  v4.4s,  v5.4s   // exptab re
+        uzp2            v7.4s,  v4.4s,  v5.4s   // exptab im
+        ld2             {v0.4s,v1.4s}, [x0],  #32
+8:
+        subs            x3,  x3,  #8            // loop body handles two groups of four; flags are consumed by b.lt and b.eq below
+
+        fmul            v4.4s,  v2.4s,  v6.4s
+        fmul            v5.4s,  v2.4s,  v7.4s
+        b.lt            4f                      // only one group left: finish it at 4:
+
+        ld1             {v18.4s,v19.4s}, [x2],  #32     // loads for the second group, interleaved with first-group math
+
+        fmls            v4.4s,  v3.4s,  v7.4s
+        fmla            v5.4s,  v3.4s,  v6.4s
+
+        ld2             {v22.4s,v23.4s}, [x1],  #32
+
+        fsub            v2.4s,  v0.4s,  v4.4s
+        fadd            v0.4s,  v0.4s,  v4.4s
+        fsub            v3.4s,  v1.4s,  v5.4s
+        fadd            v1.4s,  v1.4s,  v5.4s
+
+        uzp1            v16.4s, v18.4s, v19.4s
+        uzp2            v17.4s, v18.4s, v19.4s
+
+        st2             {v2.4s,v3.4s}, [x5],  #32
+        st2             {v0.4s,v1.4s}, [x4],  #32
+        ld2             {v20.4s,v21.4s}, [x0],  #32
+
+        fmul            v18.4s, v22.4s, v16.4s
+        fmul            v19.4s, v22.4s, v17.4s
+        b.eq            0f                      // exactly two groups were left: finish the second at 0:
+
+        ld1             {v4.4s,v5.4s}, [x2],  #32       // loads for the next iteration first group
+
+        fmls            v18.4s, v23.4s, v17.4s
+        fmla            v19.4s, v23.4s, v16.4s
+
+        ld2             {v2.4s,v3.4s}, [x1],  #32
+
+        fsub            v22.4s, v20.4s, v18.4s
+        fadd            v20.4s, v20.4s, v18.4s
+        fsub            v23.4s, v21.4s, v19.4s
+        fadd            v21.4s, v21.4s, v19.4s
+
+        uzp1            v6.4s,  v4.4s,  v5.4s
+        uzp2            v7.4s,  v4.4s,  v5.4s
+
+        st2             {v22.4s,v23.4s}, [x5],  #32
+        st2             {v20.4s,v21.4s}, [x4],  #32
+        ld2             {v0.4s,v1.4s}, [x0],  #32
+
+        b               8b
+4:
+        fmls            v4.4s,  v3.4s,  v7.4s   // tail: complete the pending first-group butterfly
+        fmla            v5.4s,  v3.4s,  v6.4s
+
+        fsub            v2.4s,  v0.4s,  v4.4s
+        fadd            v0.4s,  v0.4s,  v4.4s
+        fsub            v3.4s,  v1.4s,  v5.4s
+        fadd            v1.4s,  v1.4s,  v5.4s
+
+        st2             {v2.4s,v3.4s}, [x5],  #32
+        st2             {v0.4s,v1.4s}, [x4],  #32
+
+        ret
+0:
+        fmls            v18.4s, v23.4s, v17.4s  // tail: complete the pending second-group butterfly
+        fmla            v19.4s, v23.4s, v16.4s
+
+        fsub            v22.4s, v20.4s, v18.4s
+        fadd            v20.4s, v20.4s, v18.4s
+        fsub            v23.4s, v21.4s, v19.4s
+        fadd            v21.4s, v21.4s, v19.4s
+
+        st2             {v22.4s,v23.4s}, [x5],  #32
+        st2             {v20.4s,v21.4s}, [x4],  #32
+
+        ret
+endfunc
+
+function fft30_neon, align=6            // 30-point FFT: two fft15_neon calls + fft15_pass; x1 = out, x2 = in, x4 = stride, x10 = context
+        sub             sp,  sp,  #0x20
+        stp             x20, x21, [sp]          // x20-x22 are callee-saved and used as locals
+        stp             x22, x30, [sp, #0x10]
+        mov             x21, x1                 // x21 = out
+        mov             x22, x2                 // x22 = in
+        mov             x20, x4                 // x20 = stride
+        mov             x0,  x21                // fft15_neon takes out in x0, in in x1, stride in x3
+        mov             x1,  x22
+        lsl             x3,  x20, #1            // each half reads every second input
+        bl              fft15_neon
+
+        add             x0,  x21, #15*8         // second half of out (15 complex = 120 bytes)
+        add             x1,  x22, x20,  lsl #3  // in + 1 * stride
+        lsl             x3,  x20, #1
+        bl              fft15_neon
+
+        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
+        add             x0,  x21, #0            // out
+        add             x1,  x21, #15*8         // out + 15
+        mov             x3,  #15                // len2 for fft15_pass
+        ldp             x20, x21, [sp]          // restore everything before the tail call
+        ldp             x22, x30, [sp, #0x10]
+        add             sp,  sp,  #0x20
+        b               fft15_pass              // tail call: fft15_pass returns to our caller
+endfunc
+
+.macro  def_fft n, n2                   // defines fft<n>_neon from two fft<n2>_neon calls plus fft15_pass
+function fft\n\()_neon, align=6         // x1 = out, x2 = in, x3 = exptab index, x4 = stride, x10 = context
+        sub             sp,  sp,  #0x30
+        stp             x20, x21, [sp]
+        stp             x22, x30, [sp, #0x10]
+        stp             x23, x24, [sp, #0x20]   // x24 is not written in this function; presumably saved only to fill the pair
+        mov             x21, x1                 // x21 = out
+        mov             x22, x2                 // x22 = in
+        mov             x23, x3                 // x23 = exptab index of this level
+        mov             x20, x4                 // x20 = stride
+        sub             x3,  x3,  #1            // child level uses the previous exptab index
+        lsl             x4,  x4,  #1            // and twice the stride
+        bl              fft\n2\()_neon          // first half: x1/x2 still hold out/in
+
+        add             x1,  x21, #(\n2 * 8)    // second half of out
+        add             x2,  x22, x20, lsl #3   // in + 1 * stride
+        sub             x3,  x23, #1
+        lsl             x4,  x20, #1
+        bl              fft\n2\()_neon
+
+        add             x5,  x10, #CELT_EXPTAB
+        mov             x0,  x21
+        ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
+        add             x1,  x21, #(\n2 * 8)
+        mov             x3,  #\n2               // len2 for fft15_pass
+        ldp             x20, x21, [sp]          // restore everything before the tail call
+        ldp             x22, x30, [sp, #0x10]
+        ldp             x23, x24, [sp, #0x20]
+        add             sp,  sp,  #0x30
+        b               fft15_pass              // tail call: fft15_pass returns to our caller
+endfunc
+.endm
+
+        def_fft    60,  30                      // fft60_neon
+        def_fft   120,  60                      // fft120_neon
+        def_fft   240, 120                      // fft240_neon
+        def_fft   480, 240                      // fft480_neon
+        def_fft   960, 480                      // fft960_neon
+
+function fft_b15_calc_neon              // set up constants and dispatch via fft_tab_neon; x0 = context, x3 = table index, x1/x2/x4 passed through
+        sub             sp,  sp,  #0x50
+        ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
+        movrel          x6,  fact5
+        movrel          x11, shuffle_0213       // x11-x14: tbl index tables read by fft5_neon
+        movrel          x12, shuffle_1032
+        movrel          x13, shuffle_2301
+        movrel          x14, shuffle_3120
+        add             x8,  x8,  #8            // skip the first table entry
+        movrel          x5,  fft_tab_neon
+        stp             x20, x30, [sp]
+        stp             d8,  d9,  [sp, #0x10]   // v8-v15 are overwritten below; their low halves are callee-saved
+        stp             d10, d11, [sp, #0x20]
+        stp             d12, d13, [sp, #0x30]
+        stp             d14, d15, [sp, #0x40]
+        ld1             {v15.4s}, [x6]          // v15 = fact5 constants for fft5_neon
+        ld1             {v0.4s,v1.4s},   [x8],  #32     // exp[1-4]
+        ld1             {v6.2s},  [x8],  #8             // exp[5]
+        ld1             {v2.4s,v3.4s},   [x8],  #32     // exp[6-9]
+        ld1             {v7.2s},  [x8],  #8             // exp[10]
+        ld1             {v4.4s,v5.4s},   [x8],  #32     // exp[11-14]
+        uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
+        uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
+        uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
+        uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
+        uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
+        uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
+        zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
+        add             x5,  x5,  x3,  lsl #3   // &fft_tab_neon[x3]
+        ldr             x5,  [x5]
+        mov             x10, x0                 // x10 = context for the fft functions; ff_celt_imdct_half_neon reads it after return
+        blr             x5                      // NOTE(review): entry 0 (fft15_neon) expects x0 = out, x1 = in, x3 = stride - confirm index 0 is never used here
+        ldp             x20, x30, [sp]
+        ldp             d8,  d9,  [sp, #0x10]
+        ldp             d10, d11, [sp, #0x20]
+        ldp             d12, d13, [sp, #0x30]
+        ldp             d14, d15, [sp, #0x40]
+        add             sp,  sp,  #0x50
+        ret
+endfunc
+
+const   fft_tab_neon, relocate=1        // function pointers indexed by x3 in fft_b15_calc_neon
+        .quad fft15_neon                // index 0; NOTE(review): register interface differs from the entries below - confirm it is never selected
+        .quad fft30_neon                // index 1
+        .quad fft60_neon                // index 2
+        .quad fft120_neon               // index 3
+        .quad fft240_neon               // index 4
+        .quad fft480_neon               // index 5
+        .quad fft960_neon               // index 6
+endconst
+
+function ff_celt_imdct_half_neon, export=1      // x0 = s, x1 = dst, x2 = src, x3 = stride, s0 = scale (see prototype in imdct15_init.c)
+        sub             sp,  sp,  #0x20
+        stp             x21, x30, [sp]
+        str             s0, [sp, #0x10]         // scale is reloaded after the FFT call
+
+        ldp             w5,  w6,  [x0,  #CELT_LEN2] // CELT_LEN4
+        mov             x10, x0                 // overwritten by the ldp below before it is read
+        mov             x21, x1                 // x21 = dst, needed after the FFT call
+        sub             w5,  w5,  #1
+        lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
+        sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
+        mul             x5,  x5,  x3            // (len2 - 1) * stride
+        ldp             x9,  x10, [x0,  #CELT_TMP]  // CELT_TWIDDLE
+        ldr             w3,  [x0, #CELT_FFT_N]  // table index for fft_b15_calc_neon
+        add             x5,  x2,  x5,  lsl #2   // x5 = &src[(len2 - 1) * stride], walks backwards
+        mov             x11, x9                 // keep the start of tmp as FFT input
+
+        sub             w6,  w6,  #4            // loop counter, based on len4
+        ld1             {v0.s}[0],  [x5], x8    // v0: four floats gathered from the end of src
+        ld1             {v1.s}[0],  [x2], x7    // v1: four floats gathered from the start of src
+        ld1             {v4.4s,v5.4s}, [x10], #32       // four twiddle_exptab entries
+        ld1             {v0.s}[1],  [x5], x8
+        ld1             {v1.s}[1],  [x2], x7
+        uzp1            v2.4s,  v4.4s,  v5.4s   // twiddle re
+        ld1             {v0.s}[2],  [x5], x8
+        ld1             {v1.s}[2],  [x2], x7
+        uzp2            v3.4s,  v4.4s,  v5.4s   // twiddle im
+        ld1             {v0.s}[3],  [x5], x8
+        ld1             {v1.s}[3],  [x2], x7
+1:
+        subs            w6,  w6,  #4            // flags are consumed by b.eq 3f below
+
+        ld1             {v20.s}[0], [x5], x8    // gather the next four inputs into v20/v21 while computing on v0/v1
+        ld1             {v21.s}[0], [x2], x7
+        ld1             {v4.4s,v5.4s}, [x10], #32
+
+        fmul            v6.4s,  v0.4s,  v2.4s   // v6 = v0 * tw.re - v1 * tw.im
+        fmul            v7.4s,  v0.4s,  v3.4s   // v7 = v0 * tw.im + v1 * tw.re
+
+        ld1             {v20.s}[1], [x5], x8
+        ld1             {v21.s}[1], [x2], x7
+
+        fmls            v6.4s,  v1.4s,  v3.4s
+        fmla            v7.4s,  v1.4s,  v2.4s
+
+        ld1             {v20.s}[2], [x5], x8
+        ld1             {v21.s}[2], [x2], x7
+
+        uzp1            v2.4s,  v4.4s,  v5.4s
+        uzp2            v3.4s,  v4.4s,  v5.4s
+        ld1             {v20.s}[3], [x5], x8
+        ld1             {v21.s}[3], [x2], x7
+
+        zip1            v4.4s,  v6.4s,  v7.4s   // interleave back to (re, im) pairs
+        zip2            v5.4s,  v6.4s,  v7.4s
+
+        fmul            v6.4s,  v20.4s, v2.4s
+        fmul            v7.4s,  v20.4s, v3.4s
+
+        st1             {v4.4s,v5.4s}, [x9], #32        // store four complex values to tmp
+
+        fmls            v6.4s,  v21.4s, v3.4s
+        fmla            v7.4s,  v21.4s, v2.4s
+
+        b.eq            3f                      // counter hit zero: v6/v7 hold the last group
+
+        subs            w6,  w6,  #4            // flags are consumed by b.gt 1b below
+        ld1             {v4.4s,v5.4s}, [x10], #32
+        ld1             {v0.s}[0],  [x5], x8
+        ld1             {v1.s}[0],  [x2], x7
+        uzp1            v2.4s,  v4.4s,  v5.4s
+        ld1             {v0.s}[1],  [x5], x8
+        ld1             {v1.s}[1],  [x2], x7
+        uzp2            v3.4s,  v4.4s,  v5.4s
+        ld1             {v0.s}[2],  [x5], x8
+        ld1             {v1.s}[2],  [x2], x7
+        zip1            v4.4s,  v6.4s,  v7.4s
+        zip2            v5.4s,  v6.4s,  v7.4s
+        ld1             {v0.s}[3],  [x5], x8
+        ld1             {v1.s}[3],  [x2], x7
+
+        st1             {v4.4s,v5.4s}, [x9], #32
+
+        b.gt            1b
+
+        fmul            v6.4s,  v0.4s,  v2.4s   // loop ended with a group pending in v0/v1
+        fmul            v7.4s,  v0.4s,  v3.4s
+        fmls            v6.4s,  v1.4s,  v3.4s
+        fmla            v7.4s,  v1.4s,  v2.4s
+3:
+        zip1            v4.4s,  v6.4s,  v7.4s
+        zip2            v5.4s,  v6.4s,  v7.4s
+        st1             {v4.4s,v5.4s}, [x9], #32
+
+        mov             x2,  x11                // FFT input = tmp; x0 = s, x1 = dst and x3 = fft_n are still in place
+        mov             x4,  #1                 // initial stride
+
+        bl              fft_b15_calc_neon
+
+        ldr             w5,  [x10, #CELT_LEN4]  // x10 = s, as left by fft_b15_calc_neon
+        ldr             x6,  [x10, #CELT_TWIDDLE]
+        ldr             s31, [sp, #0x10]        // scale saved at entry
+
+        add             x1,  x21, x5,  lsl #2   // middle of dst, walks upwards
+        add             x3,  x6,  x5,  lsl #2   // middle of twiddle_exptab, walks upwards
+        sub             x0,  x1,  #16           // 16 bytes below the middle, walks downwards
+        sub             x2,  x3,  #16
+        mov             x8,  #-16
+        mov             x7,  #16
+        mov             x10, x0                 // x10/x11 = store pointers trailing the x0/x1 loads
+        mov             x11, x1
+
+        sub             w5,  w5,  #4
+
+        ld1             {v0.4s},  [x0], x8
+        ld1             {v1.4s},  [x1], x7
+        ld1             {v2.4s},  [x2], x8
+        ld1             {v3.4s},  [x3], x7
+
+        uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
+        uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
+
+        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
+        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
+
+        fmul            v1.4s,  v6.4s,  v5.4s   // v1 = z.im * tw.re + z.re * tw.im
+        fmul            v0.4s,  v6.4s,  v7.4s   // v0 = z.im * tw.im - z.re * tw.re
+2:
+        subs            w5,  w5,  #4            // flags are consumed by b.gt 2b below
+
+        ld1             {v20.4s}, [x0], x8      // next loads are interleaved with the current group
+
+        fmla            v1.4s,  v4.4s,  v7.4s
+        fmls            v0.4s,  v4.4s,  v5.4s
+
+        ld1             {v21.4s}, [x1], x7
+
+        ext             v1.16b, v1.16b, v1.16b, #8      // ext + rev64 below reverse the four lanes of v1
+        fmul            v0.4s,  v0.4s,  v31.s[0]        // apply scale
+
+        ld1             {v2.4s},  [x2], x8
+
+        rev64           v1.4s,  v1.4s
+        fmul            v1.4s,  v1.4s,  v31.s[0]        // apply scale
+
+        ld1             {v3.4s},  [x3], x7
+
+        zip1            v5.4s,  v0.4s,  v1.4s   // lower-side output vector
+        zip2            v7.4s,  v0.4s,  v1.4s   // upper-side output vector
+
+        uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
+        uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
+
+        st1             {v5.4s},  [x10], x8
+        st1             {v7.4s},  [x11], x7
+
+        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
+        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
+
+        fmul            v1.4s,  v6.4s,  v5.4s
+        fmul            v0.4s,  v6.4s,  v7.4s
+        b.gt            2b
+
+        fmla            v1.4s,  v4.4s,  v7.4s   // finish the last pending group
+        fmls            v0.4s,  v4.4s,  v5.4s
+        ext             v1.16b, v1.16b, v1.16b, #8
+        fmul            v0.4s,  v0.4s,  v31.s[0]
+        rev64           v1.4s,  v1.4s
+        fmul            v1.4s,  v1.4s,  v31.s[0]
+        zip1            v5.4s,  v0.4s,  v1.4s
+        zip2            v7.4s,  v0.4s,  v1.4s
+        st1             {v5.4s},  [x10], x8
+        st1             {v7.4s},  [x11], x7
+
+        ldp             x21, x30, [sp]
+        add             sp,  sp,  #0x20
+        ret
+endfunc
+
+// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5), each stored as (re, im); loaded into v15 for fft5_neon
+const   fact5,          align=4
+        .float           0.30901699437494745, 0.95105651629515353       // cos, sin of 2*pi/5
+        .float          -0.80901699437494734, 0.58778525229247325       // cos, sin of 4*pi/5
+endconst
diff --git a/libavcodec/aarch64/opus_imdct_init.c b/libavcodec/aarch64/opus_imdct_init.c
deleted file mode 100644
index 1a776dca00..0000000000
--- a/libavcodec/aarch64/opus_imdct_init.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-#include "libavutil/internal.h"
-#include "libavcodec/opus_imdct.h"
-
-#include "asm-offsets.h"
-
-AV_CHECK_OFFSET(CeltIMDCTContext, exptab,         CELT_EXPTAB);
-AV_CHECK_OFFSET(CeltIMDCTContext, fft_n,          CELT_FFT_N);
-AV_CHECK_OFFSET(CeltIMDCTContext, len2,           CELT_LEN2);
-AV_CHECK_OFFSET(CeltIMDCTContext, len4,           CELT_LEN4);
-AV_CHECK_OFFSET(CeltIMDCTContext, tmp,            CELT_TMP);
-AV_CHECK_OFFSET(CeltIMDCTContext, twiddle_exptab, CELT_TWIDDLE);
-
-void ff_celt_imdct_half_neon(CeltIMDCTContext *s, float *dst, const float *src,
-                             ptrdiff_t stride, float scale);
-
-void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_neon(cpu_flags)) {
-        s->imdct_half = ff_celt_imdct_half_neon;
-    }
-}
diff --git a/libavcodec/aarch64/opus_imdct_neon.S b/libavcodec/aarch64/opus_imdct_neon.S
deleted file mode 100644
index d99edf4108..0000000000
--- a/libavcodec/aarch64/opus_imdct_neon.S
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-#include "asm-offsets.h"
-
-.macro shuffle a, b, c, d
-const shuffle_\a\b\c\d, align=4                                         // 16-byte index vector for tbl, named after its four arguments
-        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)        // result lane 0 <- the 4 bytes of source lane a
-        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)        // result lane 1 <- source lane b
-        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)        // result lane 2 <- source lane c
-        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)        // result lane 3 <- source lane d
-endconst
-.endm
-
-shuffle 0, 2, 1, 3      // shuffle_0213: fft_b15_calc_neon puts its address in x11
-shuffle 1, 0, 3, 2      // shuffle_1032: address in x12
-shuffle 2, 3, 0, 1      // shuffle_2301: address in x13
-shuffle 3, 1, 2, 0      // shuffle_3120: address in x14
-
-
-function fft5_neon                                      // 5-point complex FFT step; in: x1 = input, x2 = stride in complex elements
-        lsl             x2,  x2,  #3                    // stride in bytes (8 bytes per re/im float pair)
-        ld1             {v24.2s},         [x1],  x2     // v24 = in[0] as (re, im)
-        ld2             {v25.s,v26.s}[0], [x1],  x2     // in[1..4]: re parts into v25 lanes, im parts into v26 lanes
-        ld2             {v25.s,v26.s}[1], [x1],  x2
-        ld2             {v25.s,v26.s}[2], [x1],  x2
-        ld2             {v25.s,v26.s}[3], [x1]
-        dup             v6.4s,  v24.s[0]                // v6 = in[0].re in every lane (real accumulator)
-        dup             v7.4s,  v24.s[1]                // v7 = in[0].im in every lane (imaginary accumulator)
-
-        faddp           v0.4s,  v25.4s, v26.4s          // pairwise sums of the re lanes, then of the im lanes
-        // z[][0], z[][3]
-        fmul            v16.4s, v25.4s, v15.s[0] // rr  (v15 = fact5 constants, loaded by fft_b15_calc_neon)
-        fmul            v17.4s, v25.4s, v15.s[1] // ri
-        fmul            v18.4s, v26.4s, v15.s[0] // ir
-        fmul            v19.4s, v26.4s, v15.s[1] // ii
-        faddp           v0.4s,  v0.4s,  v0.4s           // low two lanes = (sum of re, sum of im) over in[1..4]
-        // z[][1], z[][2]
-        fmul            v20.4s, v25.4s, v15.s[2] // rr
-        fmul            v21.4s, v25.4s, v15.s[3] // ri
-        fmul            v22.4s, v26.4s, v15.s[2] // ir
-        fmul            v23.4s, v26.4s, v15.s[3] // ii
-        fadd            v0.2s,  v24.2s, v0.2s   // out[0]
-
-        // z[0123][0], z[0123][3]
-        fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
-        fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
-        ld1             {v16.16b},  [x11]               // tbl index vectors; x11..x14 are set up by fft_b15_calc_neon
-        ld1             {v19.16b},  [x14]
-        fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
-        fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
-        ld1             {v17.16b},  [x12]
-        // z[0123][1], z[0123][2]
-        fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
-        fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
-        ld1             {v18.16b},  [x13]
-        fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
-        fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
-
-        //real
-        tbl             v20.16b, {v24.16b}, v16.16b     // permute 32-bit lanes so matching terms line up per output
-        tbl             v21.16b, {v25.16b}, v17.16b
-        tbl             v22.16b, {v26.16b}, v18.16b
-        tbl             v23.16b, {v27.16b}, v19.16b
-        //imag
-        tbl             v16.16b, {v28.16b}, v16.16b     // index registers are overwritten by their own results
-        tbl             v17.16b, {v29.16b}, v17.16b
-        tbl             v18.16b, {v30.16b}, v18.16b
-        tbl             v19.16b, {v31.16b}, v19.16b
-
-        fadd            v6.4s,  v6.4s,  v20.4s          // accumulate the four permuted terms onto in[0]
-        fadd            v22.4s, v22.4s, v23.4s
-        fadd            v7.4s,  v7.4s,  v16.4s
-        fadd            v18.4s, v18.4s, v19.4s
-
-        fadd            v21.4s, v21.4s, v22.4s
-        fadd            v17.4s, v17.4s, v18.4s
-        fadd            v6.4s,  v6.4s,  v21.4s          // v6 = real parts of the four non-DC outputs
-        fadd            v7.4s,  v7.4s,  v17.4s          // v7 = imaginary parts of the four non-DC outputs
-
-        ret                                             // results: v0.2s = out[0], v6/v7 = re/im of the rest
-endfunc
-
-function fft15_neon                                     // 15-point FFT from three fft5_neon calls; x0 = out, x1 = in, x3 = stride
-        mov             x8,  x1                         // x8 = in (x1 is consumed by fft5_neon)
-        mov             x9,  x30                        // keep the return address in x9; this function returns with "ret x9"
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-
-        add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
-        bl              fft5_neon
-        mov             v1.8b,   v0.8b                  // v1 = out[0] of this sub-FFT
-        mov             v2.16b,  v6.16b                 // v2/v3 = its re/im outputs
-        mov             v3.16b,  v7.16b
-
-        add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon
-        zip1            v1.4s,   v1.4s,  v0.4s          // v1 = (re_a, re_b, im_a, im_b) of the two sub-FFT out[0] values
-        mov             v4.16b,  v6.16b                 // v4/v5 = re/im outputs of the second sub-FFT
-        mov             v5.16b,  v7.16b
-
-        mov             x1,  x8                 // in + 0 * stride
-        add             x2,  x3,  x3,  lsl #1   // 3 * stride
-        bl              fft5_neon                       // third sub-FFT stays in v0 / v6 / v7
-
-        faddp           v20.4s, v1.4s,  v1.4s           // sum of the first two out[0] values (re, im)
-
-        ext             v18.16b, v8.16b,  v8.16b,  #4   // rotate exp[1-4] by one lane
-        ext             v19.16b, v9.16b,  v9.16b,  #4
-        mov             v16.16b, v6.16b                 // start accumulators from the third sub-FFT
-        mov             v17.16b, v7.16b
-        fadd            v20.2s, v20.2s, v0.2s           // add the third out[0]
-
-        uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
-        uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
-
-        st1             {v20.2s},  [x0], #8     // out[0]
-
-        fmla            v16.4s, v2.4s,  v8.4s           // complex multiply-accumulate: (v2 + i*v3) * exp[1-4]
-        fmls            v16.4s, v3.4s,  v9.4s
-
-        fmla            v17.4s, v2.4s,  v9.4s
-        fmla            v17.4s, v3.4s,  v8.4s
-
-        fmla            v16.4s, v4.4s,  v18.4s          // plus (v4 + i*v5) * exp[2,4,6,8]
-        fmls            v16.4s, v5.4s,  v19.4s
-
-        fmla            v17.4s, v4.4s,  v19.4s
-        fmla            v17.4s, v5.4s,  v18.4s
-
-        zip1            v18.4s, v16.4s, v17.4s          // interleave re/im for storing
-        zip2            v19.4s, v16.4s, v17.4s
-
-        rev64           v31.4s, v14.4s                  // v14 = exp[5,10] re/im, packed by fft_b15_calc_neon
-        trn1            v28.2d, v1.2d,  v1.2d           // v28 = (re_a, re_b) duplicated
-        trn2            v29.2d, v1.2d,  v1.2d           // v29 = (im_a, im_b) duplicated
-        zip1            v30.2d, v14.2d, v31.2d
-        zip2            v31.2d, v14.2d, v31.2d
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
-
-        fmul            v16.4s, v28.4s, v30.4s          // complex products of the two saved out[0] values with exp[5], exp[10]
-        fmul            v17.4s, v29.4s, v30.4s
-        fmls            v16.4s, v29.4s, v31.4s
-        fmla            v17.4s, v28.4s, v31.4s
-        faddp           v16.4s, v16.4s, v16.4s
-        faddp           v17.4s, v17.4s, v17.4s
-        zip1            v18.2s, v16.2s, v17.2s
-        zip2            v19.2s, v16.2s, v17.2s
-
-        fadd            v18.2s, v18.2s, v0.2s           // add the third sub-FFT's out[0]
-        fadd            v0.2s,  v19.2s, v0.2s
-
-        ext             v30.16b, v12.16b, v12.16b, #4   // rotate exp[11-14] by one lane
-        ext             v31.16b, v13.16b, v13.16b, #4
-        mov             v16.16b, v6.16b
-        mov             v17.16b, v7.16b
-
-        uzp1            v30.4s, v30.4s, v8.4s           // exp[12,14,1,3].re
-        uzp1            v31.4s, v31.4s, v9.4s           // exp[12,14,1,3].im
-
-        st1             {v18.2s},  [x0], #8     // out[5]
-
-        fmla            v16.4s, v2.4s,  v10.4s          // (v2 + i*v3) * exp[6-9]
-        fmls            v16.4s, v3.4s,  v11.4s
-
-        fmla            v17.4s, v2.4s,  v11.4s
-        fmla            v17.4s, v3.4s,  v10.4s
-
-        fmla            v16.4s, v4.4s,  v30.4s          // plus (v4 + i*v5) * exp[12,14,1,3]
-        fmls            v16.4s, v5.4s,  v31.4s
-
-        fmla            v17.4s, v4.4s,  v31.4s
-        fmla            v17.4s, v5.4s,  v30.4s
-
-        zip1            v18.4s, v16.4s, v17.4s
-        zip2            v19.4s, v16.4s, v17.4s
-
-        ext             v30.16b, v10.16b, v10.16b, #4   // rotate exp[6-9] by one lane
-        ext             v31.16b, v11.16b, v11.16b, #4
-
-        fmla            v6.4s,  v2.4s,  v12.4s          // last group accumulates straight into v6/v7: (v2 + i*v3) * exp[11-14]
-        fmls            v6.4s,  v3.4s,  v13.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
-
-        uzp1            v30.4s, v30.4s, v12.4s          // exp[7,9,11,13].re
-        uzp1            v31.4s, v31.4s, v13.4s          // exp[7,9,11,13].im
-
-        fmla            v7.4s,  v2.4s,  v13.4s
-        fmla            v7.4s,  v3.4s,  v12.4s
-
-        st1             {v0.2s},  [x0], #8     // out[10]
-
-        fmla            v6.4s,  v4.4s,  v30.4s          // plus (v4 + i*v5) * exp[7,9,11,13]
-        fmls            v6.4s,  v5.4s,  v31.4s
-
-        fmla            v7.4s,  v4.4s,  v31.4s
-        fmla            v7.4s,  v5.4s,  v30.4s
-
-        zip1            v18.4s, v6.4s,  v7.4s
-        zip2            v19.4s, v6.4s,  v7.4s
-
-        st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
-
-        ret             x9                              // x30 was clobbered by the bl calls above
-endfunc
-
-// x0: out, x1: out+len2, x2: exptab, x3: len2; in place: first half = a + w*b, second half = a - w*b (a from x0, b from x1, w from x2)
-function fft15_pass
-        ands            x6,  x3,  #3                    // x6 = len2 % 4 elements handled one at a time
-        mov             x4,  x0                         // x4/x5 = store pointers for the two halves
-        mov             x5,  x1
-        b.eq            9f                              // multiple of 4: vector code only
-        ld1             {v0.2s},  [x0], #8
-        ld1             {v1.2s},  [x1], #8
-        sub             x3,  x3,  x6                    // x3 = element count left for the vector code
-        subs            x6,  x6,  #1
-        fadd            v2.2s,  v0.2s,  v1.2s           // first element: plain sum/difference, no twiddle multiply
-        fsub            v3.2s,  v0.2s,  v1.2s
-        add             x2,  x2,  #8                    // step over the exptab entry that was not used
-        st1             {v2.2s},  [x4], #8
-        st1             {v3.2s},  [x5], #8
-        b.eq            9f
-1:
-        subs            x6,  x6,  #1                    // scalar loop over the remaining odd elements
-        ldp             s4,  s5,  [x2], #8              // w = (s4, s5)
-        ldp             s2,  s3,  [x1], #8              // b = (s2, s3)
-        ldp             s0,  s1,  [x0], #8              // a = (s0, s1)
-
-        fmul            s6,  s2,  s4                    // (s6, s7) = w * b
-        fmul            s7,  s2,  s5
-        fmls            s6,  s3,  v5.s[0]
-        fmla            s7,  s3,  v4.s[0]
-
-        fsub            s2,  s0,  s6                    // a - w*b
-        fsub            s3,  s1,  s7
-        fadd            s0,  s0,  s6                    // a + w*b
-        fadd            s1,  s1,  s7
-
-        stp             s2,  s3,  [x5], #8
-        stp             s0,  s1,  [x4], #8
-        b.gt            1b
-9:
-        ld1             {v4.4s,v5.4s}, [x2],  #32       // vector part: preload 4 twiddles, 4 b and 4 a values
-        ld2             {v2.4s,v3.4s}, [x1],  #32       // v2 = b.re, v3 = b.im
-        uzp1            v6.4s,  v4.4s,  v5.4s           // v6 = w.re
-        uzp2            v7.4s,  v4.4s,  v5.4s           // v7 = w.im
-        ld2             {v0.4s,v1.4s}, [x0],  #32       // v0 = a.re, v1 = a.im
-8:
-        subs            x3,  x3,  #8                    // two groups of 4 per iteration; flags are consumed by b.lt and b.eq below
-
-        fmul            v4.4s,  v2.4s,  v6.4s
-        fmul            v5.4s,  v2.4s,  v7.4s
-        b.lt            4f                              // only one group of 4 left: finish it at 4:
-
-        ld1             {v18.4s,v19.4s}, [x2],  #32     // loads for the second group are interleaved with the first group's math
-
-        fmls            v4.4s,  v3.4s,  v7.4s
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        ld2             {v22.4s,v23.4s}, [x1],  #32
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        uzp1            v16.4s, v18.4s, v19.4s
-        uzp2            v17.4s, v18.4s, v19.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-        ld2             {v20.4s,v21.4s}, [x0],  #32
-
-        fmul            v18.4s, v22.4s, v16.4s
-        fmul            v19.4s, v22.4s, v17.4s
-        b.eq            0f                              // exactly two groups were left: finish the second at 0:
-
-        ld1             {v4.4s,v5.4s}, [x2],  #32       // preload the next iteration's first group
-
-        fmls            v18.4s, v23.4s, v17.4s
-        fmla            v19.4s, v23.4s, v16.4s
-
-        ld2             {v2.4s,v3.4s}, [x1],  #32
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        uzp1            v6.4s,  v4.4s,  v5.4s
-        uzp2            v7.4s,  v4.4s,  v5.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-        ld2             {v0.4s,v1.4s}, [x0],  #32
-
-        b               8b
-4:
-        fmls            v4.4s,  v3.4s,  v7.4s           // tail: complete the group held in v0-v7
-        fmla            v5.4s,  v3.4s,  v6.4s
-
-        fsub            v2.4s,  v0.4s,  v4.4s
-        fadd            v0.4s,  v0.4s,  v4.4s
-        fsub            v3.4s,  v1.4s,  v5.4s
-        fadd            v1.4s,  v1.4s,  v5.4s
-
-        st2             {v2.4s,v3.4s}, [x5],  #32
-        st2             {v0.4s,v1.4s}, [x4],  #32
-
-        ret
-0:
-        fmls            v18.4s, v23.4s, v17.4s          // tail: complete the group held in v16-v23
-        fmla            v19.4s, v23.4s, v16.4s
-
-        fsub            v22.4s, v20.4s, v18.4s
-        fadd            v20.4s, v20.4s, v18.4s
-        fsub            v23.4s, v21.4s, v19.4s
-        fadd            v21.4s, v21.4s, v19.4s
-
-        st2             {v22.4s,v23.4s}, [x5],  #32
-        st2             {v20.4s,v21.4s}, [x4],  #32
-
-        ret
-endfunc
-
-function fft30_neon, align=6                            // 30-point FFT: two fft15_neon calls plus one fft15_pass
-        sub             sp,  sp,  #0x20
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        mov             x21, x1                         // x21 = out
-        mov             x22, x2                         // x22 = in
-        mov             x20, x4                         // x20 = stride
-        mov             x0,  x21                        // fft15_neon takes out in x0, in in x1, stride in x3
-        mov             x1,  x22
-        lsl             x3,  x20, #1                    // sub-FFTs read every second input element
-        bl              fft15_neon
-
-        add             x0,  x21, #15*8                 // second half of out (15 complex floats further)
-        add             x1,  x22, x20,  lsl #3          // in + 1 * stride
-        lsl             x3,  x20, #1
-        bl              fft15_neon
-
-        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
-        add             x0,  x21, #0
-        add             x1,  x21, #15*8
-        mov             x3,  #15
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        add             sp,  sp,  #0x20
-        b               fft15_pass                      // tail call: fft15_pass returns to our caller
-endfunc
-
-.macro  def_fft n, n2
-function fft\n\()_neon, align=6                         // size-n FFT built from two size-n2 FFTs and one fft15_pass
-        sub             sp,  sp,  #0x30
-        stp             x20, x21, [sp]
-        stp             x22, x30, [sp, #0x10]
-        stp             x23, x24, [sp, #0x20]           // NOTE(review): x24 is not used below; presumably saved only to fill the stp pair
-        mov             x21, x1                         // x21 = out
-        mov             x22, x2                         // x22 = in
-        mov             x23, x3                         // x23 = exptab index for this level
-        mov             x20, x4                         // x20 = stride
-        sub             x3,  x3,  #1                    // next level down uses the previous exptab entry
-        lsl             x4,  x4,  #1                    // and twice the input stride
-        bl              fft\n2\()_neon
-
-        add             x1,  x21, #(\n2 * 8)            // second half of out
-        add             x2,  x22, x20, lsl #3           // in + 1 * stride
-        sub             x3,  x23, #1
-        lsl             x4,  x20, #1
-        bl              fft\n2\()_neon
-
-        add             x5,  x10, #CELT_EXPTAB          // x10 = context pointer, set by fft_b15_calc_neon
-        mov             x0,  x21
-        ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
-        add             x1,  x21, #(\n2 * 8)
-        mov             x3,  #\n2
-        ldp             x20, x21, [sp]
-        ldp             x22, x30, [sp, #0x10]
-        ldp             x23, x24, [sp, #0x20]
-        add             sp,  sp,  #0x30
-        b               fft15_pass                      // tail call: fft15_pass returns to our caller
-endfunc
-.endm
-
-        def_fft    60,  30      // each size is defined in terms of the previous one
-        def_fft   120,  60
-        def_fft   240, 120
-        def_fft   480, 240
-        def_fft   960, 480
-
-function fft_b15_calc_neon                              // sets up shared constants, then calls fft_tab_neon[x3]; x0 = context
-        sub             sp,  sp,  #0x50
-        ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
-        movrel          x6,  fact5
-        movrel          x11, shuffle_0213               // x11..x14: tbl index vectors read by fft5_neon
-        movrel          x12, shuffle_1032
-        movrel          x13, shuffle_2301
-        movrel          x14, shuffle_3120
-        add             x8,  x8,  #8                    // skip entry 0; loading starts at exp[1]
-        movrel          x5,  fft_tab_neon
-        stp             x20, x30, [sp]
-        stp             d8,  d9,  [sp, #0x10]           // d8-d15 are written below, so save them first
-        stp             d10, d11, [sp, #0x20]
-        stp             d12, d13, [sp, #0x30]
-        stp             d14, d15, [sp, #0x40]
-        ld1             {v15.4s}, [x6]                  // v15 = the four fact5 floats
-        ld1             {v0.4s,v1.4s},   [x8],  #32     // exp[1-4], interleaved re/im
-        ld1             {v6.2s},  [x8],  #8             // exp[5]
-        ld1             {v2.4s,v3.4s},   [x8],  #32     // exp[6-9]
-        ld1             {v7.2s},  [x8],  #8             // exp[10]
-        ld1             {v4.4s,v5.4s},   [x8],  #32     // exp[11-14]
-        uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
-        uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
-        uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
-        uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
-        uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
-        uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
-        zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
-        add             x5,  x5,  x3,  lsl #3           // &fft_tab_neon[x3]
-        ldr             x5,  [x5]
-        mov             x10, x0                         // the fftN routines read the context through x10
-        blr             x5                              // x1, x2, x3 and x4 are passed through unchanged
-        ldp             x20, x30, [sp]
-        ldp             d8,  d9,  [sp, #0x10]
-        ldp             d10, d11, [sp, #0x20]
-        ldp             d12, d13, [sp, #0x30]
-        ldp             d14, d15, [sp, #0x40]
-        add             sp,  sp,  #0x50
-        ret
-endfunc
-
-const   fft_tab_neon, relocate=1        // function pointers, indexed by x3 in fft_b15_calc_neon
-        .quad fft15_neon                // NOTE(review): takes out/in in x0/x1, unlike the x1/x2 used by the entries below - confirm index 0 is never dispatched
-        .quad fft30_neon
-        .quad fft60_neon
-        .quad fft120_neon
-        .quad fft240_neon
-        .quad fft480_neon
-        .quad fft960_neon
-endconst
-
-function ff_celt_imdct_half_neon, export=1              // x0 = s, x1 = dst, x2 = src, x3 = stride, s0 = scale (per the C prototype)
-        sub             sp,  sp,  #0x20
-        stp             x21, x30, [sp]
-        str             s0, [sp, #0x10]                 // keep scale across the FFT call; reloaded into s31 afterwards
-
-        ldp             w5,  w6,  [x0,  #CELT_LEN2] // w5 = len2, w6 = len4 (CELT_LEN4 follows CELT_LEN2)
-        mov             x10, x0                         // NOTE(review): x10 is overwritten by the ldp at CELT_TMP below before any use
-        mov             x21, x1                         // x21 = dst
-        sub             w5,  w5,  #1
-        lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
-        sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
-        mul             x5,  x5,  x3                    // (len2 - 1) * stride
-        ldp             x9,  x10, [x0,  #CELT_TMP]  // x9 = s->tmp, x10 = s->twiddle_exptab (CELT_TWIDDLE follows CELT_TMP)
-        ldr             w3,  [x0, #CELT_FFT_N]          // stride is no longer needed; x3 becomes the fft_tab_neon index
-        add             x5,  x2,  x5,  lsl #2           // x5 = &src[(len2 - 1) * stride], walked backwards
-        mov             x11, x9                         // remember the start of tmp for the FFT call
-
-        sub             w6,  w6,  #4                    // pre-rotation: 4 complex values per step
-        ld1             {v0.s}[0],  [x5], x8            // v0 = samples read backwards from the end of src
-        ld1             {v1.s}[0],  [x2], x7            // v1 = samples read forwards from the start of src
-        ld1             {v4.4s,v5.4s}, [x10], #32       // 4 interleaved twiddle factors
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s           // v2 = twiddle re
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s           // v3 = twiddle im
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-1:
-        subs            w6,  w6,  #4                    // loop is unrolled twice; loads for the next group overlap the math
-
-        ld1             {v20.s}[0], [x5], x8
-        ld1             {v21.s}[0], [x2], x7
-        ld1             {v4.4s,v5.4s}, [x10], #32
-
-        fmul            v6.4s,  v0.4s,  v2.4s           // (v6 + i*v7) = (v0 + i*v1) * (v2 + i*v3)
-        fmul            v7.4s,  v0.4s,  v3.4s
-
-        ld1             {v20.s}[1], [x5], x8
-        ld1             {v21.s}[1], [x2], x7
-
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-
-        ld1             {v20.s}[2], [x5], x8
-        ld1             {v21.s}[2], [x2], x7
-
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v20.s}[3], [x5], x8
-        ld1             {v21.s}[3], [x2], x7
-
-        zip1            v4.4s,  v6.4s,  v7.4s           // back to interleaved re/im
-        zip2            v5.4s,  v6.4s,  v7.4s
-
-        fmul            v6.4s,  v20.4s, v2.4s
-        fmul            v7.4s,  v20.4s, v3.4s
-
-        st1             {v4.4s,v5.4s}, [x9], #32        // write 4 rotated values to tmp
-
-        fmls            v6.4s,  v21.4s, v3.4s
-        fmla            v7.4s,  v21.4s, v2.4s
-
-        b.eq            3f                              // flags from the subs at the top of the loop
-
-        subs            w6,  w6,  #4
-        ld1             {v4.4s,v5.4s}, [x10], #32
-        ld1             {v0.s}[0],  [x5], x8
-        ld1             {v1.s}[0],  [x2], x7
-        uzp1            v2.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[1],  [x5], x8
-        ld1             {v1.s}[1],  [x2], x7
-        uzp2            v3.4s,  v4.4s,  v5.4s
-        ld1             {v0.s}[2],  [x5], x8
-        ld1             {v1.s}[2],  [x2], x7
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        ld1             {v0.s}[3],  [x5], x8
-        ld1             {v1.s}[3],  [x2], x7
-
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        b.gt            1b
-
-        fmul            v6.4s,  v0.4s,  v2.4s           // final group when the loop exits here
-        fmul            v7.4s,  v0.4s,  v3.4s
-        fmls            v6.4s,  v1.4s,  v3.4s
-        fmla            v7.4s,  v1.4s,  v2.4s
-3:
-        zip1            v4.4s,  v6.4s,  v7.4s
-        zip2            v5.4s,  v6.4s,  v7.4s
-        st1             {v4.4s,v5.4s}, [x9], #32
-
-        mov             x2,  x11                        // FFT input = start of tmp
-        mov             x4,  #1                         // FFT input stride = 1
-
-        bl              fft_b15_calc_neon               // x0 = s, x1 = dst (output), x3 = fft_n
-
-        ldr             w5,  [x10, #CELT_LEN4]          // x10 = s here (set inside fft_b15_calc_neon)
-        ldr             x6,  [x10, #CELT_TWIDDLE]
-        ldr             s31, [sp, #0x10]                // scale
-
-        add             x1,  x21, x5,  lsl #2           // x1 = dst + len4 floats; read forwards from here
-        add             x3,  x6,  x5,  lsl #2           // same offset into twiddle_exptab
-        sub             x0,  x1,  #16                   // x0/x2 read backwards from just below that point
-        sub             x2,  x3,  #16
-        mov             x8,  #-16
-        mov             x7,  #16
-        mov             x10, x0                         // x10/x11 = store pointers; results overwrite dst in place
-        mov             x11, x1
-
-        sub             w5,  w5,  #4
-
-        ld1             {v0.4s},  [x0], x8
-        ld1             {v1.4s},  [x1], x7
-        ld1             {v2.4s},  [x2], x8
-        ld1             {v3.4s},  [x3], x7
-
-        uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-2:
-        subs            w5,  w5,  #4
-
-        ld1             {v20.4s}, [x0], x8
-
-        fmla            v1.4s,  v4.4s,  v7.4s           // v1 = z.im * t.re + z.re * t.im
-        fmls            v0.4s,  v4.4s,  v5.4s           // v0 = z.im * t.im - z.re * t.re
-
-        ld1             {v21.4s}, [x1], x7
-
-        ext             v1.16b, v1.16b, v1.16b, #8      // together with rev64 below: reverse the four lanes of v1
-        fmul            v0.4s,  v0.4s,  v31.s[0]        // apply scale
-
-        ld1             {v2.4s},  [x2], x8
-
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-
-        ld1             {v3.4s},  [x3], x7
-
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-
-        uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
-        uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
-
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
-        uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
-        fmul            v1.4s,  v6.4s,  v5.4s
-        fmul            v0.4s,  v6.4s,  v7.4s
-        b.gt            2b
-
-        fmla            v1.4s,  v4.4s,  v7.4s           // finish the last group started inside the loop
-        fmls            v0.4s,  v4.4s,  v5.4s
-        ext             v1.16b, v1.16b, v1.16b, #8
-        fmul            v0.4s,  v0.4s,  v31.s[0]
-        rev64           v1.4s,  v1.4s
-        fmul            v1.4s,  v1.4s,  v31.s[0]
-        zip1            v5.4s,  v0.4s,  v1.4s
-        zip2            v7.4s,  v0.4s,  v1.4s
-        st1             {v5.4s},  [x10], x8
-        st1             {v7.4s},  [x11], x7
-
-        ldp             x21, x30, [sp]
-        add             sp,  sp,  #0x20
-        ret
-endfunc
-
-// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5); each stored as (re, im) = (cos, sin)
-const   fact5,          align=4
-        .float           0.30901699437494745, 0.95105651629515353       // loaded into v15 by fft_b15_calc_neon
-        .float          -0.80901699437494734, 0.58778525229247325
-endconst
-- 
cgit v1.2.3