From 3d5d46233cd81f78138a6d7418d480af04d3f6c8 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sun, 1 Feb 2015 11:08:17 +0100 Subject: opus: Factor out imdct15 into a standalone component It will be reused by the AAC decoder. --- libavcodec/aarch64/Makefile | 4 +- libavcodec/aarch64/imdct15_init.c | 46 +++ libavcodec/aarch64/imdct15_neon.S | 647 +++++++++++++++++++++++++++++++++++ libavcodec/aarch64/opus_imdct_init.c | 45 --- libavcodec/aarch64/opus_imdct_neon.S | 647 ----------------------------------- 5 files changed, 695 insertions(+), 694 deletions(-) create mode 100644 libavcodec/aarch64/imdct15_init.c create mode 100644 libavcodec/aarch64/imdct15_neon.S delete mode 100644 libavcodec/aarch64/opus_imdct_init.c delete mode 100644 libavcodec/aarch64/opus_imdct_neon.S (limited to 'libavcodec/aarch64') diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 964428e35d..2afff297dc 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -3,11 +3,11 @@ OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o +OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o -OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opus_imdct_init.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o @@ -21,8 +21,8 @@ NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += 
aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o -NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opus_imdct_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c new file mode 100644 index 0000000000..38018f2b4a --- /dev/null +++ b/libavcodec/aarch64/imdct15_init.c @@ -0,0 +1,46 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> + +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavutil/internal.h" + +#include "libavcodec/imdct15.h" + +#include "asm-offsets.h" + +AV_CHECK_OFFSET(IMDCT15Context, exptab, CELT_EXPTAB); +AV_CHECK_OFFSET(IMDCT15Context, fft_n, CELT_FFT_N); +AV_CHECK_OFFSET(IMDCT15Context, len2, CELT_LEN2); +AV_CHECK_OFFSET(IMDCT15Context, len4, CELT_LEN4); +AV_CHECK_OFFSET(IMDCT15Context, tmp, CELT_TMP); +AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE); + +void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src, + ptrdiff_t stride, float scale); + +void ff_imdct15_init_aarch64(IMDCT15Context *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + s->imdct_half = ff_celt_imdct_half_neon; + } +} diff --git 
a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S new file mode 100644 index 0000000000..d99edf4108 --- /dev/null +++ b/libavcodec/aarch64/imdct15_neon.S @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2014 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#include "asm-offsets.h" + +.macro shuffle a, b, c, d +const shuffle_\a\b\c\d, align=4 + .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3) + .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3) + .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3) + .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3) +endconst +.endm + +shuffle 0, 2, 1, 3 +shuffle 1, 0, 3, 2 +shuffle 2, 3, 0, 1 +shuffle 3, 1, 2, 0 + + +function fft5_neon + lsl x2, x2, #3 + ld1 {v24.2s}, [x1], x2 + ld2 {v25.s,v26.s}[0], [x1], x2 + ld2 {v25.s,v26.s}[1], [x1], x2 + ld2 {v25.s,v26.s}[2], [x1], x2 + ld2 {v25.s,v26.s}[3], [x1] + dup v6.4s, v24.s[0] + dup v7.4s, v24.s[1] + + faddp v0.4s, v25.4s, v26.4s + // z[][0], z[][3] + fmul v16.4s, v25.4s, v15.s[0] // rr + fmul v17.4s, v25.4s, v15.s[1] // ri + fmul v18.4s, v26.4s, v15.s[0] // ir + fmul v19.4s, v26.4s, v15.s[1] // ii + faddp v0.4s, v0.4s, v0.4s + // z[][1], z[][2] + fmul v20.4s, v25.4s, v15.s[2] // rr 
+ fmul v21.4s, v25.4s, v15.s[3] // ri + fmul v22.4s, v26.4s, v15.s[2] // ir + fmul v23.4s, v26.4s, v15.s[3] // ii + fadd v0.2s, v24.2s, v0.2s // out[0] + + // z[0123][0], z[0123][3] + fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii; + fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii; + ld1 {v16.16b}, [x11] + ld1 {v19.16b}, [x14] + fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir; + fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir; + ld1 {v17.16b}, [x12] + // z[0123][1], z[0123][2] + fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii; + fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii; + ld1 {v18.16b}, [x13] + fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir; + fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir; + + //real + tbl v20.16b, {v24.16b}, v16.16b + tbl v21.16b, {v25.16b}, v17.16b + tbl v22.16b, {v26.16b}, v18.16b + tbl v23.16b, {v27.16b}, v19.16b + //imag + tbl v16.16b, {v28.16b}, v16.16b + tbl v17.16b, {v29.16b}, v17.16b + tbl v18.16b, {v30.16b}, v18.16b + tbl v19.16b, {v31.16b}, v19.16b + + fadd v6.4s, v6.4s, v20.4s + fadd v22.4s, v22.4s, v23.4s + fadd v7.4s, v7.4s, v16.4s + fadd v18.4s, v18.4s, v19.4s + + fadd v21.4s, v21.4s, v22.4s + fadd v17.4s, v17.4s, v18.4s + fadd v6.4s, v6.4s, v21.4s + fadd v7.4s, v7.4s, v17.4s + + ret +endfunc + +function fft15_neon + mov x8, x1 + mov x9, x30 + add x2, x3, x3, lsl #1 // 3 * stride + + add x1, x8, x3, lsl #3 // in + 1 * stride + bl fft5_neon + mov v1.8b, v0.8b + mov v2.16b, v6.16b + mov v3.16b, v7.16b + + add x1, x8, x3, lsl #4 // in + 2 * stride + add x2, x3, x3, lsl #1 // 3 * stride + bl fft5_neon + zip1 v1.4s, v1.4s, v0.4s + mov v4.16b, v6.16b + mov v5.16b, v7.16b + + mov x1, x8 // in + 0 * stride + add x2, x3, x3, lsl #1 // 3 * stride + bl fft5_neon + + faddp v20.4s, v1.4s, v1.4s + + ext v18.16b, v8.16b, v8.16b, #4 + ext v19.16b, v9.16b, v9.16b, #4 + mov v16.16b, v6.16b + mov v17.16b, v7.16b + fadd v20.2s, v20.2s, v0.2s + + uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re + uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im 
+ + st1 {v20.2s}, [x0], #8 // out[0] + + fmla v16.4s, v2.4s, v8.4s + fmls v16.4s, v3.4s, v9.4s + + fmla v17.4s, v2.4s, v9.4s + fmla v17.4s, v3.4s, v8.4s + + fmla v16.4s, v4.4s, v18.4s + fmls v16.4s, v5.4s, v19.4s + + fmla v17.4s, v4.4s, v19.4s + fmla v17.4s, v5.4s, v18.4s + + zip1 v18.4s, v16.4s, v17.4s + zip2 v19.4s, v16.4s, v17.4s + + rev64 v31.4s, v14.4s + trn1 v28.2d, v1.2d, v1.2d + trn2 v29.2d, v1.2d, v1.2d + zip1 v30.2d, v14.2d, v31.2d + zip2 v31.2d, v14.2d, v31.2d + + st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4] + + fmul v16.4s, v28.4s, v30.4s + fmul v17.4s, v29.4s, v30.4s + fmls v16.4s, v29.4s, v31.4s + fmla v17.4s, v28.4s, v31.4s + faddp v16.4s, v16.4s, v16.4s + faddp v17.4s, v17.4s, v17.4s + zip1 v18.2s, v16.2s, v17.2s + zip2 v19.2s, v16.2s, v17.2s + + fadd v18.2s, v18.2s, v0.2s + fadd v0.2s, v19.2s, v0.2s + + ext v30.16b, v12.16b, v12.16b, #4 + ext v31.16b, v13.16b, v13.16b, #4 + mov v16.16b, v6.16b + mov v17.16b, v7.16b + + uzp1 v30.4s, v30.4s, v8.4s + uzp1 v31.4s, v31.4s, v9.4s + + st1 {v18.2s}, [x0], #8 // out[5] + + fmla v16.4s, v2.4s, v10.4s + fmls v16.4s, v3.4s, v11.4s + + fmla v17.4s, v2.4s, v11.4s + fmla v17.4s, v3.4s, v10.4s + + fmla v16.4s, v4.4s, v30.4s + fmls v16.4s, v5.4s, v31.4s + + fmla v17.4s, v4.4s, v31.4s + fmla v17.4s, v5.4s, v30.4s + + zip1 v18.4s, v16.4s, v17.4s + zip2 v19.4s, v16.4s, v17.4s + + ext v30.16b, v10.16b, v10.16b, #4 + ext v31.16b, v11.16b, v11.16b, #4 + + fmla v6.4s, v2.4s, v12.4s + fmls v6.4s, v3.4s, v13.4s + + st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9] + + uzp1 v30.4s, v30.4s, v12.4s + uzp1 v31.4s, v31.4s, v13.4s + + fmla v7.4s, v2.4s, v13.4s + fmla v7.4s, v3.4s, v12.4s + + st1 {v0.2s}, [x0], #8 // out[10] + + fmla v6.4s, v4.4s, v30.4s + fmls v6.4s, v5.4s, v31.4s + + fmla v7.4s, v4.4s, v31.4s + fmla v7.4s, v5.4s, v30.4s + + zip1 v18.4s, v6.4s, v7.4s + zip2 v19.4s, v6.4s, v7.4s + + st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14] + + ret x9 +endfunc + +// x0: out, x1: out+len2, x2: exptab, x3: len2 +function fft15_pass + ands 
x6, x3, #3 + mov x4, x0 + mov x5, x1 + b.eq 9f + ld1 {v0.2s}, [x0], #8 + ld1 {v1.2s}, [x1], #8 + sub x3, x3, x6 + subs x6, x6, #1 + fadd v2.2s, v0.2s, v1.2s + fsub v3.2s, v0.2s, v1.2s + add x2, x2, #8 + st1 {v2.2s}, [x4], #8 + st1 {v3.2s}, [x5], #8 + b.eq 9f +1: + subs x6, x6, #1 + ldp s4, s5, [x2], #8 + ldp s2, s3, [x1], #8 + ldp s0, s1, [x0], #8 + + fmul s6, s2, s4 + fmul s7, s2, s5 + fmls s6, s3, v5.s[0] + fmla s7, s3, v4.s[0] + + fsub s2, s0, s6 + fsub s3, s1, s7 + fadd s0, s0, s6 + fadd s1, s1, s7 + + stp s2, s3, [x5], #8 + stp s0, s1, [x4], #8 + b.gt 1b +9: + ld1 {v4.4s,v5.4s}, [x2], #32 + ld2 {v2.4s,v3.4s}, [x1], #32 + uzp1 v6.4s, v4.4s, v5.4s + uzp2 v7.4s, v4.4s, v5.4s + ld2 {v0.4s,v1.4s}, [x0], #32 +8: + subs x3, x3, #8 + + fmul v4.4s, v2.4s, v6.4s + fmul v5.4s, v2.4s, v7.4s + b.lt 4f + + ld1 {v18.4s,v19.4s}, [x2], #32 + + fmls v4.4s, v3.4s, v7.4s + fmla v5.4s, v3.4s, v6.4s + + ld2 {v22.4s,v23.4s}, [x1], #32 + + fsub v2.4s, v0.4s, v4.4s + fadd v0.4s, v0.4s, v4.4s + fsub v3.4s, v1.4s, v5.4s + fadd v1.4s, v1.4s, v5.4s + + uzp1 v16.4s, v18.4s, v19.4s + uzp2 v17.4s, v18.4s, v19.4s + + st2 {v2.4s,v3.4s}, [x5], #32 + st2 {v0.4s,v1.4s}, [x4], #32 + ld2 {v20.4s,v21.4s}, [x0], #32 + + fmul v18.4s, v22.4s, v16.4s + fmul v19.4s, v22.4s, v17.4s + b.eq 0f + + ld1 {v4.4s,v5.4s}, [x2], #32 + + fmls v18.4s, v23.4s, v17.4s + fmla v19.4s, v23.4s, v16.4s + + ld2 {v2.4s,v3.4s}, [x1], #32 + + fsub v22.4s, v20.4s, v18.4s + fadd v20.4s, v20.4s, v18.4s + fsub v23.4s, v21.4s, v19.4s + fadd v21.4s, v21.4s, v19.4s + + uzp1 v6.4s, v4.4s, v5.4s + uzp2 v7.4s, v4.4s, v5.4s + + st2 {v22.4s,v23.4s}, [x5], #32 + st2 {v20.4s,v21.4s}, [x4], #32 + ld2 {v0.4s,v1.4s}, [x0], #32 + + b 8b +4: + fmls v4.4s, v3.4s, v7.4s + fmla v5.4s, v3.4s, v6.4s + + fsub v2.4s, v0.4s, v4.4s + fadd v0.4s, v0.4s, v4.4s + fsub v3.4s, v1.4s, v5.4s + fadd v1.4s, v1.4s, v5.4s + + st2 {v2.4s,v3.4s}, [x5], #32 + st2 {v0.4s,v1.4s}, [x4], #32 + + ret +0: + fmls v18.4s, v23.4s, v17.4s + fmla v19.4s, v23.4s, v16.4s + + fsub 
v22.4s, v20.4s, v18.4s + fadd v20.4s, v20.4s, v18.4s + fsub v23.4s, v21.4s, v19.4s + fadd v21.4s, v21.4s, v19.4s + + st2 {v22.4s,v23.4s}, [x5], #32 + st2 {v20.4s,v21.4s}, [x4], #32 + + ret +endfunc + +function fft30_neon, align=6 + sub sp, sp, #0x20 + stp x20, x21, [sp] + stp x22, x30, [sp, #0x10] + mov x21, x1 + mov x22, x2 + mov x20, x4 + mov x0, x21 + mov x1, x22 + lsl x3, x20, #1 + bl fft15_neon + + add x0, x21, #15*8 + add x1, x22, x20, lsl #3 + lsl x3, x20, #1 + bl fft15_neon + + ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1] + add x0, x21, #0 + add x1, x21, #15*8 + mov x3, #15 + ldp x20, x21, [sp] + ldp x22, x30, [sp, #0x10] + add sp, sp, #0x20 + b fft15_pass +endfunc + +.macro def_fft n, n2 +function fft\n\()_neon, align=6 + sub sp, sp, #0x30 + stp x20, x21, [sp] + stp x22, x30, [sp, #0x10] + stp x23, x24, [sp, #0x20] + mov x21, x1 + mov x22, x2 + mov x23, x3 + mov x20, x4 + sub x3, x3, #1 + lsl x4, x4, #1 + bl fft\n2\()_neon + + add x1, x21, #(\n2 * 8) + add x2, x22, x20, lsl #3 + sub x3, x23, #1 + lsl x4, x20, #1 + bl fft\n2\()_neon + + add x5, x10, #CELT_EXPTAB + mov x0, x21 + ldr x2, [x5, x23, lsl #3] // s->exptab[N] + add x1, x21, #(\n2 * 8) + mov x3, #\n2 + ldp x20, x21, [sp] + ldp x22, x30, [sp, #0x10] + ldp x23, x24, [sp, #0x20] + add sp, sp, #0x30 + b fft15_pass +endfunc +.endm + + def_fft 60, 30 + def_fft 120, 60 + def_fft 240, 120 + def_fft 480, 240 + def_fft 960, 480 + +function fft_b15_calc_neon + sub sp, sp, #0x50 + ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0] + movrel x6, fact5 + movrel x11, shuffle_0213 + movrel x12, shuffle_1032 + movrel x13, shuffle_2301 + movrel x14, shuffle_3120 + add x8, x8, #8 + movrel x5, fft_tab_neon + stp x20, x30, [sp] + stp d8, d9, [sp, #0x10] + stp d10, d11, [sp, #0x20] + stp d12, d13, [sp, #0x30] + stp d14, d15, [sp, #0x40] + ld1 {v15.4s}, [x6] + ld1 {v0.4s,v1.4s}, [x8], #32 + ld1 {v6.2s}, [x8], #8 + ld1 {v2.4s,v3.4s}, [x8], #32 + ld1 {v7.2s}, [x8], #8 + ld1 {v4.4s,v5.4s}, [x8], #32 + uzp1 v8.4s, v0.4s, v1.4s 
// exp[ 1 - 4].re + uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im + uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re + uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im + uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re + uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im + zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im + add x5, x5, x3, lsl #3 + ldr x5, [x5] + mov x10, x0 + blr x5 + ldp x20, x30, [sp] + ldp d8, d9, [sp, #0x10] + ldp d10, d11, [sp, #0x20] + ldp d12, d13, [sp, #0x30] + ldp d14, d15, [sp, #0x40] + add sp, sp, #0x50 + ret +endfunc + +const fft_tab_neon, relocate=1 + .quad fft15_neon + .quad fft30_neon + .quad fft60_neon + .quad fft120_neon + .quad fft240_neon + .quad fft480_neon + .quad fft960_neon +endconst + +function ff_celt_imdct_half_neon, export=1 + sub sp, sp, #0x20 + stp x21, x30, [sp] + str s0, [sp, #0x10] + + ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4 + mov x10, x0 + mov x21, x1 + sub w5, w5, #1 + lsl x7, x3, #3 // 2 * stride * sizeof(float) + sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float) + mul x5, x5, x3 + ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE + ldr w3, [x0, #CELT_FFT_N] + add x5, x2, x5, lsl #2 + mov x11, x9 + + sub w6, w6, #4 + ld1 {v0.s}[0], [x5], x8 + ld1 {v1.s}[0], [x2], x7 + ld1 {v4.4s,v5.4s}, [x10], #32 + ld1 {v0.s}[1], [x5], x8 + ld1 {v1.s}[1], [x2], x7 + uzp1 v2.4s, v4.4s, v5.4s + ld1 {v0.s}[2], [x5], x8 + ld1 {v1.s}[2], [x2], x7 + uzp2 v3.4s, v4.4s, v5.4s + ld1 {v0.s}[3], [x5], x8 + ld1 {v1.s}[3], [x2], x7 +1: + subs w6, w6, #4 + + ld1 {v20.s}[0], [x5], x8 + ld1 {v21.s}[0], [x2], x7 + ld1 {v4.4s,v5.4s}, [x10], #32 + + fmul v6.4s, v0.4s, v2.4s + fmul v7.4s, v0.4s, v3.4s + + ld1 {v20.s}[1], [x5], x8 + ld1 {v21.s}[1], [x2], x7 + + fmls v6.4s, v1.4s, v3.4s + fmla v7.4s, v1.4s, v2.4s + + ld1 {v20.s}[2], [x5], x8 + ld1 {v21.s}[2], [x2], x7 + + uzp1 v2.4s, v4.4s, v5.4s + uzp2 v3.4s, v4.4s, v5.4s + ld1 {v20.s}[3], [x5], x8 + ld1 {v21.s}[3], [x2], x7 + + zip1 v4.4s, v6.4s, v7.4s + zip2 v5.4s, v6.4s, v7.4s + + fmul v6.4s, v20.4s, v2.4s + 
fmul v7.4s, v20.4s, v3.4s + + st1 {v4.4s,v5.4s}, [x9], #32 + + fmls v6.4s, v21.4s, v3.4s + fmla v7.4s, v21.4s, v2.4s + + b.eq 3f + + subs w6, w6, #4 + ld1 {v4.4s,v5.4s}, [x10], #32 + ld1 {v0.s}[0], [x5], x8 + ld1 {v1.s}[0], [x2], x7 + uzp1 v2.4s, v4.4s, v5.4s + ld1 {v0.s}[1], [x5], x8 + ld1 {v1.s}[1], [x2], x7 + uzp2 v3.4s, v4.4s, v5.4s + ld1 {v0.s}[2], [x5], x8 + ld1 {v1.s}[2], [x2], x7 + zip1 v4.4s, v6.4s, v7.4s + zip2 v5.4s, v6.4s, v7.4s + ld1 {v0.s}[3], [x5], x8 + ld1 {v1.s}[3], [x2], x7 + + st1 {v4.4s,v5.4s}, [x9], #32 + + b.gt 1b + + fmul v6.4s, v0.4s, v2.4s + fmul v7.4s, v0.4s, v3.4s + fmls v6.4s, v1.4s, v3.4s + fmla v7.4s, v1.4s, v2.4s +3: + zip1 v4.4s, v6.4s, v7.4s + zip2 v5.4s, v6.4s, v7.4s + st1 {v4.4s,v5.4s}, [x9], #32 + + mov x2, x11 + mov x4, #1 + + bl fft_b15_calc_neon + + ldr w5, [x10, #CELT_LEN4] + ldr x6, [x10, #CELT_TWIDDLE] + ldr s31, [sp, #0x10] + + add x1, x21, x5, lsl #2 + add x3, x6, x5, lsl #2 + sub x0, x1, #16 + sub x2, x3, #16 + mov x8, #-16 + mov x7, #16 + mov x10, x0 + mov x11, x1 + + sub w5, w5, #4 + + ld1 {v0.4s}, [x0], x8 + ld1 {v1.4s}, [x1], x7 + ld1 {v2.4s}, [x2], x8 + ld1 {v3.4s}, [x3], x7 + + uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re + uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im + + uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re + uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im + + fmul v1.4s, v6.4s, v5.4s + fmul v0.4s, v6.4s, v7.4s +2: + subs w5, w5, #4 + + ld1 {v20.4s}, [x0], x8 + + fmla v1.4s, v4.4s, v7.4s + fmls v0.4s, v4.4s, v5.4s + + ld1 {v21.4s}, [x1], x7 + + ext v1.16b, v1.16b, v1.16b, #8 + fmul v0.4s, v0.4s, v31.s[0] + + ld1 {v2.4s}, [x2], x8 + + rev64 v1.4s, v1.4s + fmul v1.4s, v1.4s, v31.s[0] + + ld1 {v3.4s}, [x3], x7 + + zip1 v5.4s, v0.4s, v1.4s + zip2 v7.4s, v0.4s, v1.4s + + uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re + uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im + + st1 {v5.4s}, [x10], x8 + st1 {v7.4s}, [x11], x7 + + uzp1 v5.4s, 
v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re + uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im + + fmul v1.4s, v6.4s, v5.4s + fmul v0.4s, v6.4s, v7.4s + b.gt 2b + + fmla v1.4s, v4.4s, v7.4s + fmls v0.4s, v4.4s, v5.4s + ext v1.16b, v1.16b, v1.16b, #8 + fmul v0.4s, v0.4s, v31.s[0] + rev64 v1.4s, v1.4s + fmul v1.4s, v1.4s, v31.s[0] + zip1 v5.4s, v0.4s, v1.4s + zip2 v7.4s, v0.4s, v1.4s + st1 {v5.4s}, [x10], x8 + st1 {v7.4s}, [x11], x7 + + ldp x21, x30, [sp] + add sp, sp, #0x20 + ret +endfunc + +// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5) +const fact5, align=4 + .float 0.30901699437494745, 0.95105651629515353 + .float -0.80901699437494734, 0.58778525229247325 +endconst diff --git a/libavcodec/aarch64/opus_imdct_init.c b/libavcodec/aarch64/opus_imdct_init.c deleted file mode 100644 index 1a776dca00..0000000000 --- a/libavcodec/aarch64/opus_imdct_init.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> - -#include "libavutil/cpu.h" -#include "libavutil/aarch64/cpu.h" -#include "libavutil/internal.h" -#include "libavcodec/opus_imdct.h" - -#include "asm-offsets.h" - -AV_CHECK_OFFSET(CeltIMDCTContext, exptab, CELT_EXPTAB); -AV_CHECK_OFFSET(CeltIMDCTContext, fft_n, CELT_FFT_N); -AV_CHECK_OFFSET(CeltIMDCTContext, len2, CELT_LEN2); -AV_CHECK_OFFSET(CeltIMDCTContext, len4, CELT_LEN4); -AV_CHECK_OFFSET(CeltIMDCTContext, tmp, CELT_TMP); -AV_CHECK_OFFSET(CeltIMDCTContext, twiddle_exptab, CELT_TWIDDLE); - -void ff_celt_imdct_half_neon(CeltIMDCTContext *s, float *dst, const float *src, - ptrdiff_t stride, float scale); - -void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->imdct_half = ff_celt_imdct_half_neon; - } -} diff --git a/libavcodec/aarch64/opus_imdct_neon.S b/libavcodec/aarch64/opus_imdct_neon.S deleted file mode 100644 index d99edf4108..0000000000 --- a/libavcodec/aarch64/opus_imdct_neon.S +++ /dev/null @@ -1,647 +0,0 @@ -/* - * Copyright (c) 2014 Janne Grunau - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/aarch64/asm.S" - -#include "asm-offsets.h" - -.macro shuffle a, b, c, d -const shuffle_\a\b\c\d, align=4 - .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3) - .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3) - .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3) - .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3) -endconst -.endm - -shuffle 0, 2, 1, 3 -shuffle 1, 0, 3, 2 -shuffle 2, 3, 0, 1 -shuffle 3, 1, 2, 0 - - -function fft5_neon - lsl x2, x2, #3 - ld1 {v24.2s}, [x1], x2 - ld2 {v25.s,v26.s}[0], [x1], x2 - ld2 {v25.s,v26.s}[1], [x1], x2 - ld2 {v25.s,v26.s}[2], [x1], x2 - ld2 {v25.s,v26.s}[3], [x1] - dup v6.4s, v24.s[0] - dup v7.4s, v24.s[1] - - faddp v0.4s, v25.4s, v26.4s - // z[][0], z[][3] - fmul v16.4s, v25.4s, v15.s[0] // rr - fmul v17.4s, v25.4s, v15.s[1] // ri - fmul v18.4s, v26.4s, v15.s[0] // ir - fmul v19.4s, v26.4s, v15.s[1] // ii - faddp v0.4s, v0.4s, v0.4s - // z[][1], z[][2] - fmul v20.4s, v25.4s, v15.s[2] // rr - fmul v21.4s, v25.4s, v15.s[3] // ri - fmul v22.4s, v26.4s, v15.s[2] // ir - fmul v23.4s, v26.4s, v15.s[3] // ii - fadd v0.2s, v24.2s, v0.2s // out[0] - - // z[0123][0], z[0123][3] - fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii; - fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii; - ld1 {v16.16b}, [x11] - ld1 {v19.16b}, [x14] - fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir; - fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir; - ld1 {v17.16b}, [x12] - // z[0123][1], z[0123][2] - fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii; - fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii; - ld1 {v18.16b}, [x13] - fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir; - fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir; - - //real - tbl v20.16b, {v24.16b}, v16.16b - tbl v21.16b, {v25.16b}, 
v17.16b - tbl v22.16b, {v26.16b}, v18.16b - tbl v23.16b, {v27.16b}, v19.16b - //imag - tbl v16.16b, {v28.16b}, v16.16b - tbl v17.16b, {v29.16b}, v17.16b - tbl v18.16b, {v30.16b}, v18.16b - tbl v19.16b, {v31.16b}, v19.16b - - fadd v6.4s, v6.4s, v20.4s - fadd v22.4s, v22.4s, v23.4s - fadd v7.4s, v7.4s, v16.4s - fadd v18.4s, v18.4s, v19.4s - - fadd v21.4s, v21.4s, v22.4s - fadd v17.4s, v17.4s, v18.4s - fadd v6.4s, v6.4s, v21.4s - fadd v7.4s, v7.4s, v17.4s - - ret -endfunc - -function fft15_neon - mov x8, x1 - mov x9, x30 - add x2, x3, x3, lsl #1 // 3 * stride - - add x1, x8, x3, lsl #3 // in + 1 * stride - bl fft5_neon - mov v1.8b, v0.8b - mov v2.16b, v6.16b - mov v3.16b, v7.16b - - add x1, x8, x3, lsl #4 // in + 2 * stride - add x2, x3, x3, lsl #1 // 3 * stride - bl fft5_neon - zip1 v1.4s, v1.4s, v0.4s - mov v4.16b, v6.16b - mov v5.16b, v7.16b - - mov x1, x8 // in + 0 * stride - add x2, x3, x3, lsl #1 // 3 * stride - bl fft5_neon - - faddp v20.4s, v1.4s, v1.4s - - ext v18.16b, v8.16b, v8.16b, #4 - ext v19.16b, v9.16b, v9.16b, #4 - mov v16.16b, v6.16b - mov v17.16b, v7.16b - fadd v20.2s, v20.2s, v0.2s - - uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re - uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im - - st1 {v20.2s}, [x0], #8 // out[0] - - fmla v16.4s, v2.4s, v8.4s - fmls v16.4s, v3.4s, v9.4s - - fmla v17.4s, v2.4s, v9.4s - fmla v17.4s, v3.4s, v8.4s - - fmla v16.4s, v4.4s, v18.4s - fmls v16.4s, v5.4s, v19.4s - - fmla v17.4s, v4.4s, v19.4s - fmla v17.4s, v5.4s, v18.4s - - zip1 v18.4s, v16.4s, v17.4s - zip2 v19.4s, v16.4s, v17.4s - - rev64 v31.4s, v14.4s - trn1 v28.2d, v1.2d, v1.2d - trn2 v29.2d, v1.2d, v1.2d - zip1 v30.2d, v14.2d, v31.2d - zip2 v31.2d, v14.2d, v31.2d - - st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4] - - fmul v16.4s, v28.4s, v30.4s - fmul v17.4s, v29.4s, v30.4s - fmls v16.4s, v29.4s, v31.4s - fmla v17.4s, v28.4s, v31.4s - faddp v16.4s, v16.4s, v16.4s - faddp v17.4s, v17.4s, v17.4s - zip1 v18.2s, v16.2s, v17.2s - zip2 v19.2s, v16.2s, v17.2s - - fadd v18.2s, 
v18.2s, v0.2s - fadd v0.2s, v19.2s, v0.2s - - ext v30.16b, v12.16b, v12.16b, #4 - ext v31.16b, v13.16b, v13.16b, #4 - mov v16.16b, v6.16b - mov v17.16b, v7.16b - - uzp1 v30.4s, v30.4s, v8.4s - uzp1 v31.4s, v31.4s, v9.4s - - st1 {v18.2s}, [x0], #8 // out[5] - - fmla v16.4s, v2.4s, v10.4s - fmls v16.4s, v3.4s, v11.4s - - fmla v17.4s, v2.4s, v11.4s - fmla v17.4s, v3.4s, v10.4s - - fmla v16.4s, v4.4s, v30.4s - fmls v16.4s, v5.4s, v31.4s - - fmla v17.4s, v4.4s, v31.4s - fmla v17.4s, v5.4s, v30.4s - - zip1 v18.4s, v16.4s, v17.4s - zip2 v19.4s, v16.4s, v17.4s - - ext v30.16b, v10.16b, v10.16b, #4 - ext v31.16b, v11.16b, v11.16b, #4 - - fmla v6.4s, v2.4s, v12.4s - fmls v6.4s, v3.4s, v13.4s - - st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9] - - uzp1 v30.4s, v30.4s, v12.4s - uzp1 v31.4s, v31.4s, v13.4s - - fmla v7.4s, v2.4s, v13.4s - fmla v7.4s, v3.4s, v12.4s - - st1 {v0.2s}, [x0], #8 // out[10] - - fmla v6.4s, v4.4s, v30.4s - fmls v6.4s, v5.4s, v31.4s - - fmla v7.4s, v4.4s, v31.4s - fmla v7.4s, v5.4s, v30.4s - - zip1 v18.4s, v6.4s, v7.4s - zip2 v19.4s, v6.4s, v7.4s - - st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14] - - ret x9 -endfunc - -// x0: out, x1: out+len2, x2: exptab, x3: len2 -function fft15_pass - ands x6, x3, #3 - mov x4, x0 - mov x5, x1 - b.eq 9f - ld1 {v0.2s}, [x0], #8 - ld1 {v1.2s}, [x1], #8 - sub x3, x3, x6 - subs x6, x6, #1 - fadd v2.2s, v0.2s, v1.2s - fsub v3.2s, v0.2s, v1.2s - add x2, x2, #8 - st1 {v2.2s}, [x4], #8 - st1 {v3.2s}, [x5], #8 - b.eq 9f -1: - subs x6, x6, #1 - ldp s4, s5, [x2], #8 - ldp s2, s3, [x1], #8 - ldp s0, s1, [x0], #8 - - fmul s6, s2, s4 - fmul s7, s2, s5 - fmls s6, s3, v5.s[0] - fmla s7, s3, v4.s[0] - - fsub s2, s0, s6 - fsub s3, s1, s7 - fadd s0, s0, s6 - fadd s1, s1, s7 - - stp s2, s3, [x5], #8 - stp s0, s1, [x4], #8 - b.gt 1b -9: - ld1 {v4.4s,v5.4s}, [x2], #32 - ld2 {v2.4s,v3.4s}, [x1], #32 - uzp1 v6.4s, v4.4s, v5.4s - uzp2 v7.4s, v4.4s, v5.4s - ld2 {v0.4s,v1.4s}, [x0], #32 -8: - subs x3, x3, #8 - - fmul v4.4s, v2.4s, v6.4s - fmul v5.4s, 
v2.4s, v7.4s - b.lt 4f - - ld1 {v18.4s,v19.4s}, [x2], #32 - - fmls v4.4s, v3.4s, v7.4s - fmla v5.4s, v3.4s, v6.4s - - ld2 {v22.4s,v23.4s}, [x1], #32 - - fsub v2.4s, v0.4s, v4.4s - fadd v0.4s, v0.4s, v4.4s - fsub v3.4s, v1.4s, v5.4s - fadd v1.4s, v1.4s, v5.4s - - uzp1 v16.4s, v18.4s, v19.4s - uzp2 v17.4s, v18.4s, v19.4s - - st2 {v2.4s,v3.4s}, [x5], #32 - st2 {v0.4s,v1.4s}, [x4], #32 - ld2 {v20.4s,v21.4s}, [x0], #32 - - fmul v18.4s, v22.4s, v16.4s - fmul v19.4s, v22.4s, v17.4s - b.eq 0f - - ld1 {v4.4s,v5.4s}, [x2], #32 - - fmls v18.4s, v23.4s, v17.4s - fmla v19.4s, v23.4s, v16.4s - - ld2 {v2.4s,v3.4s}, [x1], #32 - - fsub v22.4s, v20.4s, v18.4s - fadd v20.4s, v20.4s, v18.4s - fsub v23.4s, v21.4s, v19.4s - fadd v21.4s, v21.4s, v19.4s - - uzp1 v6.4s, v4.4s, v5.4s - uzp2 v7.4s, v4.4s, v5.4s - - st2 {v22.4s,v23.4s}, [x5], #32 - st2 {v20.4s,v21.4s}, [x4], #32 - ld2 {v0.4s,v1.4s}, [x0], #32 - - b 8b -4: - fmls v4.4s, v3.4s, v7.4s - fmla v5.4s, v3.4s, v6.4s - - fsub v2.4s, v0.4s, v4.4s - fadd v0.4s, v0.4s, v4.4s - fsub v3.4s, v1.4s, v5.4s - fadd v1.4s, v1.4s, v5.4s - - st2 {v2.4s,v3.4s}, [x5], #32 - st2 {v0.4s,v1.4s}, [x4], #32 - - ret -0: - fmls v18.4s, v23.4s, v17.4s - fmla v19.4s, v23.4s, v16.4s - - fsub v22.4s, v20.4s, v18.4s - fadd v20.4s, v20.4s, v18.4s - fsub v23.4s, v21.4s, v19.4s - fadd v21.4s, v21.4s, v19.4s - - st2 {v22.4s,v23.4s}, [x5], #32 - st2 {v20.4s,v21.4s}, [x4], #32 - - ret -endfunc - -function fft30_neon, align=6 - sub sp, sp, #0x20 - stp x20, x21, [sp] - stp x22, x30, [sp, #0x10] - mov x21, x1 - mov x22, x2 - mov x20, x4 - mov x0, x21 - mov x1, x22 - lsl x3, x20, #1 - bl fft15_neon - - add x0, x21, #15*8 - add x1, x22, x20, lsl #3 - lsl x3, x20, #1 - bl fft15_neon - - ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1] - add x0, x21, #0 - add x1, x21, #15*8 - mov x3, #15 - ldp x20, x21, [sp] - ldp x22, x30, [sp, #0x10] - add sp, sp, #0x20 - b fft15_pass -endfunc - -.macro def_fft n, n2 -function fft\n\()_neon, align=6 - sub sp, sp, #0x30 - stp x20, x21, 
[sp] - stp x22, x30, [sp, #0x10] - stp x23, x24, [sp, #0x20] - mov x21, x1 - mov x22, x2 - mov x23, x3 - mov x20, x4 - sub x3, x3, #1 - lsl x4, x4, #1 - bl fft\n2\()_neon - - add x1, x21, #(\n2 * 8) - add x2, x22, x20, lsl #3 - sub x3, x23, #1 - lsl x4, x20, #1 - bl fft\n2\()_neon - - add x5, x10, #CELT_EXPTAB - mov x0, x21 - ldr x2, [x5, x23, lsl #3] // s->exptab[N] - add x1, x21, #(\n2 * 8) - mov x3, #\n2 - ldp x20, x21, [sp] - ldp x22, x30, [sp, #0x10] - ldp x23, x24, [sp, #0x20] - add sp, sp, #0x30 - b fft15_pass -endfunc -.endm - - def_fft 60, 30 - def_fft 120, 60 - def_fft 240, 120 - def_fft 480, 240 - def_fft 960, 480 - -function fft_b15_calc_neon - sub sp, sp, #0x50 - ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0] - movrel x6, fact5 - movrel x11, shuffle_0213 - movrel x12, shuffle_1032 - movrel x13, shuffle_2301 - movrel x14, shuffle_3120 - add x8, x8, #8 - movrel x5, fft_tab_neon - stp x20, x30, [sp] - stp d8, d9, [sp, #0x10] - stp d10, d11, [sp, #0x20] - stp d12, d13, [sp, #0x30] - stp d14, d15, [sp, #0x40] - ld1 {v15.4s}, [x6] - ld1 {v0.4s,v1.4s}, [x8], #32 - ld1 {v6.2s}, [x8], #8 - ld1 {v2.4s,v3.4s}, [x8], #32 - ld1 {v7.2s}, [x8], #8 - ld1 {v4.4s,v5.4s}, [x8], #32 - uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re - uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im - uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re - uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im - uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re - uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im - zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im - add x5, x5, x3, lsl #3 - ldr x5, [x5] - mov x10, x0 - blr x5 - ldp x20, x30, [sp] - ldp d8, d9, [sp, #0x10] - ldp d10, d11, [sp, #0x20] - ldp d12, d13, [sp, #0x30] - ldp d14, d15, [sp, #0x40] - add sp, sp, #0x50 - ret -endfunc - -const fft_tab_neon, relocate=1 - .quad fft15_neon - .quad fft30_neon - .quad fft60_neon - .quad fft120_neon - .quad fft240_neon - .quad fft480_neon - .quad fft960_neon -endconst - -function ff_celt_imdct_half_neon, export=1 - sub sp, sp, #0x20 
- stp x21, x30, [sp] - str s0, [sp, #0x10] - - ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4 - mov x10, x0 - mov x21, x1 - sub w5, w5, #1 - lsl x7, x3, #3 // 2 * stride * sizeof(float) - sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float) - mul x5, x5, x3 - ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE - ldr w3, [x0, #CELT_FFT_N] - add x5, x2, x5, lsl #2 - mov x11, x9 - - sub w6, w6, #4 - ld1 {v0.s}[0], [x5], x8 - ld1 {v1.s}[0], [x2], x7 - ld1 {v4.4s,v5.4s}, [x10], #32 - ld1 {v0.s}[1], [x5], x8 - ld1 {v1.s}[1], [x2], x7 - uzp1 v2.4s, v4.4s, v5.4s - ld1 {v0.s}[2], [x5], x8 - ld1 {v1.s}[2], [x2], x7 - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v0.s}[3], [x5], x8 - ld1 {v1.s}[3], [x2], x7 -1: - subs w6, w6, #4 - - ld1 {v20.s}[0], [x5], x8 - ld1 {v21.s}[0], [x2], x7 - ld1 {v4.4s,v5.4s}, [x10], #32 - - fmul v6.4s, v0.4s, v2.4s - fmul v7.4s, v0.4s, v3.4s - - ld1 {v20.s}[1], [x5], x8 - ld1 {v21.s}[1], [x2], x7 - - fmls v6.4s, v1.4s, v3.4s - fmla v7.4s, v1.4s, v2.4s - - ld1 {v20.s}[2], [x5], x8 - ld1 {v21.s}[2], [x2], x7 - - uzp1 v2.4s, v4.4s, v5.4s - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v20.s}[3], [x5], x8 - ld1 {v21.s}[3], [x2], x7 - - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - - fmul v6.4s, v20.4s, v2.4s - fmul v7.4s, v20.4s, v3.4s - - st1 {v4.4s,v5.4s}, [x9], #32 - - fmls v6.4s, v21.4s, v3.4s - fmla v7.4s, v21.4s, v2.4s - - b.eq 3f - - subs w6, w6, #4 - ld1 {v4.4s,v5.4s}, [x10], #32 - ld1 {v0.s}[0], [x5], x8 - ld1 {v1.s}[0], [x2], x7 - uzp1 v2.4s, v4.4s, v5.4s - ld1 {v0.s}[1], [x5], x8 - ld1 {v1.s}[1], [x2], x7 - uzp2 v3.4s, v4.4s, v5.4s - ld1 {v0.s}[2], [x5], x8 - ld1 {v1.s}[2], [x2], x7 - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - ld1 {v0.s}[3], [x5], x8 - ld1 {v1.s}[3], [x2], x7 - - st1 {v4.4s,v5.4s}, [x9], #32 - - b.gt 1b - - fmul v6.4s, v0.4s, v2.4s - fmul v7.4s, v0.4s, v3.4s - fmls v6.4s, v1.4s, v3.4s - fmla v7.4s, v1.4s, v2.4s -3: - zip1 v4.4s, v6.4s, v7.4s - zip2 v5.4s, v6.4s, v7.4s - st1 {v4.4s,v5.4s}, [x9], #32 - - mov x2, x11 - mov x4, #1 - - bl 
fft_b15_calc_neon - - ldr w5, [x10, #CELT_LEN4] - ldr x6, [x10, #CELT_TWIDDLE] - ldr s31, [sp, #0x10] - - add x1, x21, x5, lsl #2 - add x3, x6, x5, lsl #2 - sub x0, x1, #16 - sub x2, x3, #16 - mov x8, #-16 - mov x7, #16 - mov x10, x0 - mov x11, x1 - - sub w5, w5, #4 - - ld1 {v0.4s}, [x0], x8 - ld1 {v1.4s}, [x1], x7 - ld1 {v2.4s}, [x2], x8 - ld1 {v3.4s}, [x3], x7 - - uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re - uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im - - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im - - fmul v1.4s, v6.4s, v5.4s - fmul v0.4s, v6.4s, v7.4s -2: - subs w5, w5, #4 - - ld1 {v20.4s}, [x0], x8 - - fmla v1.4s, v4.4s, v7.4s - fmls v0.4s, v4.4s, v5.4s - - ld1 {v21.4s}, [x1], x7 - - ext v1.16b, v1.16b, v1.16b, #8 - fmul v0.4s, v0.4s, v31.s[0] - - ld1 {v2.4s}, [x2], x8 - - rev64 v1.4s, v1.4s - fmul v1.4s, v1.4s, v31.s[0] - - ld1 {v3.4s}, [x3], x7 - - zip1 v5.4s, v0.4s, v1.4s - zip2 v7.4s, v0.4s, v1.4s - - uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re - uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im - - st1 {v5.4s}, [x10], x8 - st1 {v7.4s}, [x11], x7 - - uzp1 v5.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].re - uzp2 v7.4s, v2.4s, v3.4s // twidlle_exptab[-i-2, -i-1, +i, i+1].im - - fmul v1.4s, v6.4s, v5.4s - fmul v0.4s, v6.4s, v7.4s - b.gt 2b - - fmla v1.4s, v4.4s, v7.4s - fmls v0.4s, v4.4s, v5.4s - ext v1.16b, v1.16b, v1.16b, #8 - fmul v0.4s, v0.4s, v31.s[0] - rev64 v1.4s, v1.4s - fmul v1.4s, v1.4s, v31.s[0] - zip1 v5.4s, v0.4s, v1.4s - zip2 v7.4s, v0.4s, v1.4s - st1 {v5.4s}, [x10], x8 - st1 {v7.4s}, [x11], x7 - - ldp x21, x30, [sp] - add sp, sp, #0x20 - ret -endfunc - -// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5) -const fact5, align=4 - .float 0.30901699437494745, 0.95105651629515353 - .float -0.80901699437494734, 0.58778525229247325 -endconst -- cgit v1.2.3