Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--  libavcodec/aarch64/Makefile | 25
-rw-r--r--  libavcodec/aarch64/aacpsdsp_init_aarch64.c | 48
-rw-r--r--  libavcodec/aarch64/aacpsdsp_neon.S | 148
-rw-r--r--  libavcodec/aarch64/asm-offsets.h | 8
-rw-r--r--  libavcodec/aarch64/cabac.h | 8
-rw-r--r--  libavcodec/aarch64/dcadsp_neon.S | 109
-rw-r--r--  libavcodec/aarch64/fft_init_aarch64.c | 20
-rw-r--r--  libavcodec/aarch64/fft_neon.S | 8
-rw-r--r--  libavcodec/aarch64/fmtconvert_init.c | 8
-rw-r--r--  libavcodec/aarch64/fmtconvert_neon.S | 8
-rw-r--r--  libavcodec/aarch64/h264chroma_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/h264cmc_neon.S | 10
-rw-r--r--  libavcodec/aarch64/h264dsp_init_aarch64.c | 46
-rw-r--r--  libavcodec/aarch64/h264dsp_neon.S | 8
-rw-r--r--  libavcodec/aarch64/h264idct_neon.S | 28
-rw-r--r--  libavcodec/aarch64/h264pred_init.c | 8
-rw-r--r--  libavcodec/aarch64/h264pred_neon.S | 8
-rw-r--r--  libavcodec/aarch64/h264qpel_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/h264qpel_neon.S | 8
-rw-r--r--  libavcodec/aarch64/hpeldsp_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/hpeldsp_neon.S | 8
-rw-r--r--  libavcodec/aarch64/idct.h | 28
-rw-r--r--  libavcodec/aarch64/idctdsp_init_aarch64.c | 41
-rw-r--r--  libavcodec/aarch64/imdct15_init.c | 46
-rw-r--r--  libavcodec/aarch64/imdct15_neon.S | 647
-rw-r--r--  libavcodec/aarch64/mdct_init.c | 39
-rw-r--r--  libavcodec/aarch64/mdct_neon.S | 8
-rw-r--r--  libavcodec/aarch64/mpegaudiodsp_init.c | 8
-rw-r--r--  libavcodec/aarch64/mpegaudiodsp_neon.S | 8
-rw-r--r--  libavcodec/aarch64/neon.S | 8
-rw-r--r--  libavcodec/aarch64/neontest.c | 8
-rw-r--r--  libavcodec/aarch64/rv40dsp_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/sbrdsp_init_aarch64.c | 70
-rw-r--r--  libavcodec/aarch64/sbrdsp_neon.S | 327
-rw-r--r--  libavcodec/aarch64/simple_idct_neon.S | 362
-rw-r--r--  libavcodec/aarch64/synth_filter_init.c (renamed from libavcodec/aarch64/dcadsp_init.c) | 23
-rw-r--r--  libavcodec/aarch64/synth_filter_neon.S | 8
-rw-r--r--  libavcodec/aarch64/vc1dsp_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/videodsp.S | 8
-rw-r--r--  libavcodec/aarch64/videodsp_init.c | 8
-rw-r--r--  libavcodec/aarch64/vorbisdsp_init.c | 8
-rw-r--r--  libavcodec/aarch64/vorbisdsp_neon.S | 8
-rw-r--r--  libavcodec/aarch64/vp8dsp.h | 8
-rw-r--r--  libavcodec/aarch64/vp8dsp_init_aarch64.c | 8
-rw-r--r--  libavcodec/aarch64/vp8dsp_neon.S | 8
-rw-r--r--  libavcodec/aarch64/vp9dsp_init.h | 29
-rw-r--r--  libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c | 23
-rw-r--r--  libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c | 23
-rw-r--r--  libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c | 273
-rw-r--r--  libavcodec/aarch64/vp9dsp_init_aarch64.c | 23
-rw-r--r--  libavcodec/aarch64/vp9itxfm_16bpp_neon.S | 2017
-rw-r--r--  libavcodec/aarch64/vp9itxfm_neon.S | 8
-rw-r--r--  libavcodec/aarch64/vp9lpf_16bpp_neon.S | 873
-rw-r--r--  libavcodec/aarch64/vp9lpf_neon.S | 8
-rw-r--r--  libavcodec/aarch64/vp9mc_16bpp_neon.S | 631
-rw-r--r--  libavcodec/aarch64/vp9mc_neon.S | 25
56 files changed, 5123 insertions, 1051 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 7228eaea24..8bc8bc528c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -6,19 +6,21 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_init.o
-OBJS-$(CONFIG_MDCT) += aarch64/mdct_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o
# decoders/encoders
-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o
+OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o \
+ aarch64/sbrdsp_init_aarch64.o
+OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
-OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
+OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_aarch64.o
+OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9dsp_init_10bpp_aarch64.o \
+ aarch64/vp9dsp_init_12bpp_aarch64.o \
+ aarch64/vp9dsp_init_aarch64.o
# ARMv8 optimizations
@@ -28,6 +30,7 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
# NEON optimizations
# subsystems
+NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
@@ -37,15 +40,19 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o
+NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
+ aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
# decoders/encoders
-NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
- aarch64/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
-NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \
+NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
+ aarch64/vp9itxfm_neon.o \
+ aarch64/vp9lpf_16bpp_neon.o \
aarch64/vp9lpf_neon.o \
+ aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o
diff --git a/libavcodec/aarch64/aacpsdsp_init_aarch64.c b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
new file mode 100644
index 0000000000..5e7e19bba4
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_init_aarch64.c
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
+ float *src1, int n);
+void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ ptrdiff_t stride, int n);
+void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
+void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
+
+av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_neon;
+ s->mul_pair_single = ff_ps_mul_pair_single_neon;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
+ s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
+ s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_neon;
+ }
+}
diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
new file mode 100644
index 0000000000..ff4e6e244a
--- /dev/null
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -0,0 +1,148 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_ps_add_squares_neon, export=1
+1: ld1 {v0.4S,v1.4S}, [x1], #32
+ fmul v0.4S, v0.4S, v0.4S
+ fmul v1.4S, v1.4S, v1.4S
+ faddp v2.4S, v0.4S, v1.4S
+ ld1 {v3.4S}, [x0]
+ fadd v3.4S, v3.4S, v2.4S
+ st1 {v3.4S}, [x0], #16
+ subs w2, w2, #4
+ b.gt 1b
+ ret
+endfunc
+
+function ff_ps_mul_pair_single_neon, export=1
+1: ld1 {v0.4S,v1.4S}, [x1], #32
+ ld1 {v2.4S}, [x2], #16
+ zip1 v3.4S, v2.4S, v2.4S
+ zip2 v4.4S, v2.4S, v2.4S
+ fmul v0.4S, v0.4S, v3.4S
+ fmul v1.4S, v1.4S, v4.4S
+ st1 {v0.4S,v1.4S}, [x0], #32
+ subs w3, w3, #4
+ b.gt 1b
+ ret
+endfunc
+
+function ff_ps_stereo_interpolate_neon, export=1
+ ld1 {v0.4S}, [x2]
+ ld1 {v1.4S}, [x3]
+ zip1 v4.4S, v0.4S, v0.4S
+ zip2 v5.4S, v0.4S, v0.4S
+ zip1 v6.4S, v1.4S, v1.4S
+ zip2 v7.4S, v1.4S, v1.4S
+1: ld1 {v2.2S}, [x0]
+ ld1 {v3.2S}, [x1]
+ fadd v4.4S, v4.4S, v6.4S
+ fadd v5.4S, v5.4S, v7.4S
+ mov v2.D[1], v2.D[0]
+ mov v3.D[1], v3.D[0]
+ fmul v2.4S, v2.4S, v4.4S
+ fmla v2.4S, v3.4S, v5.4S
+ st1 {v2.D}[0], [x0], #8
+ st1 {v2.D}[1], [x1], #8
+ subs w4, w4, #1
+ b.gt 1b
+ ret
+endfunc
+
+function ff_ps_stereo_interpolate_ipdopd_neon, export=1
+ ld1 {v0.4S,v1.4S}, [x2]
+ ld1 {v6.4S,v7.4S}, [x3]
+ fneg v2.4S, v1.4S
+ fneg v3.4S, v7.4S
+ zip1 v16.4S, v0.4S, v0.4S
+ zip2 v17.4S, v0.4S, v0.4S
+ zip1 v18.4S, v2.4S, v1.4S
+ zip2 v19.4S, v2.4S, v1.4S
+ zip1 v20.4S, v6.4S, v6.4S
+ zip2 v21.4S, v6.4S, v6.4S
+ zip1 v22.4S, v3.4S, v7.4S
+ zip2 v23.4S, v3.4S, v7.4S
+1: ld1 {v2.2S}, [x0]
+ ld1 {v3.2S}, [x1]
+ fadd v16.4S, v16.4S, v20.4S
+ fadd v17.4S, v17.4S, v21.4S
+ mov v2.D[1], v2.D[0]
+ mov v3.D[1], v3.D[0]
+ fmul v4.4S, v2.4S, v16.4S
+ fmla v4.4S, v3.4S, v17.4S
+ fadd v18.4S, v18.4S, v22.4S
+ fadd v19.4S, v19.4S, v23.4S
+ ext v2.16B, v2.16B, v2.16B, #4
+ ext v3.16B, v3.16B, v3.16B, #4
+ fmla v4.4S, v2.4S, v18.4S
+ fmla v4.4S, v3.4S, v19.4S
+ st1 {v4.D}[0], [x0], #8
+ st1 {v4.D}[1], [x1], #8
+ subs w4, w4, #1
+ b.gt 1b
+ ret
+endfunc
+
+function ff_ps_hybrid_analysis_neon, export=1
+ lsl x3, x3, #3
+ ld2 {v0.4S,v1.4S}, [x1], #32
+ ld2 {v2.2S,v3.2S}, [x1], #16
+ ld1 {v24.2S}, [x1], #8
+ ld2 {v4.2S,v5.2S}, [x1], #16
+ ld2 {v6.4S,v7.4S}, [x1]
+ rev64 v6.4S, v6.4S
+ rev64 v7.4S, v7.4S
+ ext v6.16B, v6.16B, v6.16B, #8
+ ext v7.16B, v7.16B, v7.16B, #8
+ rev64 v4.2S, v4.2S
+ rev64 v5.2S, v5.2S
+ mov v2.D[1], v3.D[0]
+ mov v4.D[1], v5.D[0]
+ mov v5.D[1], v2.D[0]
+ mov v3.D[1], v4.D[0]
+ fadd v16.4S, v0.4S, v6.4S
+ fadd v17.4S, v1.4S, v7.4S
+ fsub v18.4S, v1.4S, v7.4S
+ fsub v19.4S, v0.4S, v6.4S
+ fadd v22.4S, v2.4S, v4.4S
+ fsub v23.4S, v5.4S, v3.4S
+ trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5}
+ trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7}
+1: ld2 {v2.4S,v3.4S}, [x2], #32
+ ld2 {v4.2S,v5.2S}, [x2], #16
+ ld1 {v6.2S}, [x2], #8
+ add x2, x2, #8
+ mov v4.D[1], v5.D[0]
+ mov v6.S[1], v6.S[0]
+ fmul v6.2S, v6.2S, v24.2S
+ fmul v0.4S, v2.4S, v16.4S
+ fmul v1.4S, v2.4S, v17.4S
+ fmls v0.4S, v3.4S, v18.4S
+ fmla v1.4S, v3.4S, v19.4S
+ fmla v0.4S, v4.4S, v20.4S
+ fmla v1.4S, v4.4S, v21.4S
+ faddp v0.4S, v0.4S, v1.4S
+ faddp v0.4S, v0.4S, v0.4S
+ fadd v0.2S, v0.2S, v6.2S
+ st1 {v0.2S}, [x0], x3
+ subs w4, w4, #1
+ b.gt 1b
+ ret
+endfunc
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 60e32ddd1d..e05c5ad2e4 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
index e12953e86c..6b9b77eb30 100644
--- a/libavcodec/aarch64/cabac.h
+++ b/libavcodec/aarch64/cabac.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/dcadsp_neon.S b/libavcodec/aarch64/dcadsp_neon.S
deleted file mode 100644
index 4cd3328042..0000000000
--- a/libavcodec/aarch64/dcadsp_neon.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
- mov x3, #32 // decifactor
- sub x1, x1, #7*4
- add x4, x0, #2*32*4 - 16 // out2
- mov x7, #-16
-
- ld1 {v0.4s,v1.4s}, [x1]
- // reverse [-num_coeffs + 1, 0]
- ext v3.16b, v0.16b, v0.16b, #8
- ext v2.16b, v1.16b, v1.16b, #8
- rev64 v3.4s, v3.4s
- rev64 v2.4s, v2.4s
-1:
- ld1 {v4.4s,v5.4s}, [x2], #32
- ld1 {v6.4s,v7.4s}, [x2], #32
- subs x3, x3, #4
- fmul v16.4s, v2.4s, v4.4s
- fmul v23.4s, v0.4s, v4.4s
- fmul v17.4s, v2.4s, v6.4s
- fmul v22.4s, v0.4s, v6.4s
-
- fmla v16.4s, v3.4s, v5.4s
- fmla v23.4s, v1.4s, v5.4s
- ld1 {v4.4s,v5.4s}, [x2], #32
- fmla v17.4s, v3.4s, v7.4s
- fmla v22.4s, v1.4s, v7.4s
- ld1 {v6.4s,v7.4s}, [x2], #32
- fmul v18.4s, v2.4s, v4.4s
- fmul v21.4s, v0.4s, v4.4s
- fmul v19.4s, v2.4s, v6.4s
- fmul v20.4s, v0.4s, v6.4s
-
- fmla v18.4s, v3.4s, v5.4s
- fmla v21.4s, v1.4s, v5.4s
- fmla v19.4s, v3.4s, v7.4s
- fmla v20.4s, v1.4s, v7.4s
-
- faddp v16.4s, v16.4s, v17.4s
- faddp v18.4s, v18.4s, v19.4s
- faddp v20.4s, v20.4s, v21.4s
- faddp v22.4s, v22.4s, v23.4s
- faddp v16.4s, v16.4s, v18.4s
- faddp v20.4s, v20.4s, v22.4s
-
- st1 {v16.4s}, [x0], #16
- st1 {v20.4s}, [x4], x7
- b.gt 1b
-
- ret
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
- mov x3, #64 // decifactor
- sub x1, x1, #3*4
- add x4, x0, #2*64*4 - 16 // out2
- mov x7, #-16
-
- ld1 {v0.4s}, [x1]
- // reverse [-num_coeffs + 1, 0]
- ext v1.16b, v0.16b, v0.16b, #8
- rev64 v1.4s, v1.4s
-
-1:
- ld1 {v4.4s,v5.4s}, [x2], #32
- ld1 {v6.4s,v7.4s}, [x2], #32
- subs x3, x3, #4
- fmul v16.4s, v1.4s, v4.4s
- fmul v23.4s, v0.4s, v4.4s
- fmul v17.4s, v1.4s, v5.4s
- fmul v22.4s, v0.4s, v5.4s
- fmul v18.4s, v1.4s, v6.4s
- fmul v21.4s, v0.4s, v6.4s
- fmul v19.4s, v1.4s, v7.4s
- fmul v20.4s, v0.4s, v7.4s
- faddp v16.4s, v16.4s, v17.4s
- faddp v18.4s, v18.4s, v19.4s
- faddp v20.4s, v20.4s, v21.4s
- faddp v22.4s, v22.4s, v23.4s
- faddp v16.4s, v16.4s, v18.4s
- faddp v20.4s, v20.4s, v22.4s
- st1 {v16.4s}, [x0], #16
- st1 {v20.4s}, [x4], x7
- b.gt 1b
-
- ret
-endfunc
diff --git a/libavcodec/aarch64/fft_init_aarch64.c b/libavcodec/aarch64/fft_init_aarch64.c
index 9cc57d331e..db285205ab 100644
--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
@@ -1,23 +1,25 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
+
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
@@ -27,6 +29,10 @@
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
av_cold void ff_fft_init_aarch64(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -34,5 +40,11 @@ av_cold void ff_fft_init_aarch64(FFTContext *s)
if (have_neon(cpu_flags)) {
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
}
}
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index e205e23d88..862039f97d 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -8,20 +8,20 @@
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/fmtconvert_init.c b/libavcodec/aarch64/fmtconvert_init.c
index 0a55a1b88c..210e74b654 100644
--- a/libavcodec/aarch64/fmtconvert_init.c
+++ b/libavcodec/aarch64/fmtconvert_init.c
@@ -1,20 +1,20 @@
/*
* ARM optimized Format Conversion Utils
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S
index 3b33c87ade..2161c3a8ae 100644
--- a/libavcodec/aarch64/fmtconvert_neon.S
+++ b/libavcodec/aarch64/fmtconvert_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264chroma_init_aarch64.c b/libavcodec/aarch64/h264chroma_init_aarch64.c
index a373291344..fa6e0eaf15 100644
--- a/libavcodec/aarch64/h264chroma_init_aarch64.c
+++ b/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -2,20 +2,20 @@
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index edc256cbc3..8be7578001 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -442,7 +442,7 @@ endconst
h264_chroma_mc4 avg, rv40
#endif
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index 07bda2ff07..649d2ab1d7 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,39 +25,39 @@
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta);
-void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
-void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
-void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
-void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, ptrdiff_t stride, int height,
int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, ptrdiff_t stride, int height,
int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, ptrdiff_t stride, int height,
int log2_den, int weight, int offset);
-void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int log2_den, int weightd,
int weights, int offset);
-void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int log2_den, int weightd,
int weights, int offset);
-void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int height, int log2_den, int weightd,
int weights, int offset);
@@ -91,10 +91,12 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
- c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ if (chroma_format_idc <= 1) {
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
+ c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
+ }
c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
- c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
- c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 448e575b8c..80ac09d2be 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 1c43c1f301..7de44205d3 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,6 +23,7 @@
#include "neon.S"
function ff_h264_idct_add_neon, export=1
+.L_ff_h264_idct_add_neon:
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
sxtw x2, w2
movi v30.8H, #0
@@ -77,6 +78,7 @@ function ff_h264_idct_add_neon, export=1
endfunc
function ff_h264_idct_dc_add_neon, export=1
+.L_ff_h264_idct_dc_add_neon:
sxtw x2, w2
mov w3, #0
ld1r {v2.8H}, [x1]
@@ -106,8 +108,8 @@ function ff_h264_idct_add16_neon, export=1
mov w9, w3 // stride
movrel x7, scan8
mov x10, #16
- movrel x13, X(ff_h264_idct_dc_add_neon)
- movrel x14, X(ff_h264_idct_add_neon)
+ movrel x13, .L_ff_h264_idct_dc_add_neon
+ movrel x14, .L_ff_h264_idct_add_neon
1: mov w2, w9
ldrb w3, [x7], #1
ldrsw x0, [x5], #4
@@ -133,8 +135,8 @@ function ff_h264_idct_add16intra_neon, export=1
mov w9, w3 // stride
movrel x7, scan8
mov x10, #16
- movrel x13, X(ff_h264_idct_dc_add_neon)
- movrel x14, X(ff_h264_idct_add_neon)
+ movrel x13, .L_ff_h264_idct_dc_add_neon
+ movrel x14, .L_ff_h264_idct_add_neon
1: mov w2, w9
ldrb w3, [x7], #1
ldrsw x0, [x5], #4
@@ -160,8 +162,8 @@ function ff_h264_idct_add8_neon, export=1
add x5, x1, #16*4 // block_offset
add x9, x2, #16*32 // block
mov w19, w3 // stride
- movrel x13, X(ff_h264_idct_dc_add_neon)
- movrel x14, X(ff_h264_idct_add_neon)
+ movrel x13, .L_ff_h264_idct_dc_add_neon
+ movrel x14, .L_ff_h264_idct_add_neon
movrel x7, scan8, 16
mov x10, #0
mov x11, #16
@@ -263,6 +265,7 @@ endfunc
.endm
function ff_h264_idct8_add_neon, export=1
+.L_ff_h264_idct8_add_neon:
movi v19.8H, #0
sxtw x2, w2
ld1 {v24.8H, v25.8H}, [x1]
@@ -326,6 +329,7 @@ function ff_h264_idct8_add_neon, export=1
endfunc
function ff_h264_idct8_dc_add_neon, export=1
+.L_ff_h264_idct8_dc_add_neon:
mov w3, #0
sxtw x2, w2
ld1r {v31.8H}, [x1]
@@ -375,8 +379,8 @@ function ff_h264_idct8_add4_neon, export=1
mov w2, w3
movrel x7, scan8
mov w10, #16
- movrel x13, X(ff_h264_idct8_dc_add_neon)
- movrel x14, X(ff_h264_idct8_add_neon)
+ movrel x13, .L_ff_h264_idct8_dc_add_neon
+ movrel x14, .L_ff_h264_idct8_add_neon
1: ldrb w9, [x7], #4
ldrsw x0, [x5], #16
ldrb w9, [x4, w9, UXTW]
diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c
index 8f912cbca9..b144376f90 100644
--- a/libavcodec/aarch64/h264pred_init.c
+++ b/libavcodec/aarch64/h264pred_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index a38a27f186..213b40b3e7 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 74088b216c..77f41d9a21 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 731dc0658d..d27cfac494 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/hpeldsp_init_aarch64.c b/libavcodec/aarch64/hpeldsp_init_aarch64.c
index 6bc4c09f6c..144ae2bcc4 100644
--- a/libavcodec/aarch64/hpeldsp_init_aarch64.c
+++ b/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index 29782908f8..a491c173bb 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/idct.h b/libavcodec/aarch64/idct.h
new file mode 100644
index 0000000000..5c49046148
--- /dev/null
+++ b/libavcodec/aarch64/idct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stdint.h>
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
new file mode 100644
index 0000000000..0406e60830
--- /dev/null
+++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,41 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+{
+ if (!avctx->lowres && !high_bit_depth) {
+ if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+ c->idct_put = ff_simple_idct_put_neon;
+ c->idct_add = ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->perm_type = FF_IDCT_PERM_PARTTRANS;
+ }
+ }
+}
diff --git a/libavcodec/aarch64/imdct15_init.c b/libavcodec/aarch64/imdct15_init.c
deleted file mode 100644
index 38018f2b4a..0000000000
--- a/libavcodec/aarch64/imdct15_init.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-#include "libavutil/internal.h"
-
-#include "libavcodec/imdct15.h"
-
-#include "asm-offsets.h"
-
-AV_CHECK_OFFSET(IMDCT15Context, exptab, CELT_EXPTAB);
-AV_CHECK_OFFSET(IMDCT15Context, fft_n, CELT_FFT_N);
-AV_CHECK_OFFSET(IMDCT15Context, len2, CELT_LEN2);
-AV_CHECK_OFFSET(IMDCT15Context, len4, CELT_LEN4);
-AV_CHECK_OFFSET(IMDCT15Context, tmp, CELT_TMP);
-AV_CHECK_OFFSET(IMDCT15Context, twiddle_exptab, CELT_TWIDDLE);
-
-void ff_celt_imdct_half_neon(IMDCT15Context *s, float *dst, const float *src,
- ptrdiff_t stride, float scale);
-
-void ff_imdct15_init_aarch64(IMDCT15Context *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->imdct_half = ff_celt_imdct_half_neon;
- }
-}
diff --git a/libavcodec/aarch64/imdct15_neon.S b/libavcodec/aarch64/imdct15_neon.S
deleted file mode 100644
index d99edf4108..0000000000
--- a/libavcodec/aarch64/imdct15_neon.S
+++ /dev/null
@@ -1,647 +0,0 @@
-/*
- * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/aarch64/asm.S"
-
-#include "asm-offsets.h"
-
-.macro shuffle a, b, c, d
-const shuffle_\a\b\c\d, align=4
- .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
- .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
- .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
- .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
-endconst
-.endm
-
-shuffle 0, 2, 1, 3
-shuffle 1, 0, 3, 2
-shuffle 2, 3, 0, 1
-shuffle 3, 1, 2, 0
-
-
-function fft5_neon
- lsl x2, x2, #3
- ld1 {v24.2s}, [x1], x2
- ld2 {v25.s,v26.s}[0], [x1], x2
- ld2 {v25.s,v26.s}[1], [x1], x2
- ld2 {v25.s,v26.s}[2], [x1], x2
- ld2 {v25.s,v26.s}[3], [x1]
- dup v6.4s, v24.s[0]
- dup v7.4s, v24.s[1]
-
- faddp v0.4s, v25.4s, v26.4s
- // z[][0], z[][3]
- fmul v16.4s, v25.4s, v15.s[0] // rr
- fmul v17.4s, v25.4s, v15.s[1] // ri
- fmul v18.4s, v26.4s, v15.s[0] // ir
- fmul v19.4s, v26.4s, v15.s[1] // ii
- faddp v0.4s, v0.4s, v0.4s
- // z[][1], z[][2]
- fmul v20.4s, v25.4s, v15.s[2] // rr
- fmul v21.4s, v25.4s, v15.s[3] // ri
- fmul v22.4s, v26.4s, v15.s[2] // ir
- fmul v23.4s, v26.4s, v15.s[3] // ii
- fadd v0.2s, v24.2s, v0.2s // out[0]
-
- // z[0123][0], z[0123][3]
- fsub v24.4s, v16.4s, v19.4s // (c).re = rr - ii;
- fadd v27.4s, v16.4s, v19.4s // (d).re = rr + ii;
- ld1 {v16.16b}, [x11]
- ld1 {v19.16b}, [x14]
- fadd v28.4s, v17.4s, v18.4s // (c).im = ri + ir;
- fsub v31.4s, v18.4s, v17.4s // (d).im = -ri + ir;
- ld1 {v17.16b}, [x12]
- // z[0123][1], z[0123][2]
- fsub v25.4s, v20.4s, v23.4s // (c).re = rr - ii;
- fadd v26.4s, v20.4s, v23.4s // (d).re = rr + ii;
- ld1 {v18.16b}, [x13]
- fadd v29.4s, v21.4s, v22.4s // (c).im = ri + ir;
- fsub v30.4s, v22.4s, v21.4s // (d).im = -ri + ir;
-
- //real
- tbl v20.16b, {v24.16b}, v16.16b
- tbl v21.16b, {v25.16b}, v17.16b
- tbl v22.16b, {v26.16b}, v18.16b
- tbl v23.16b, {v27.16b}, v19.16b
- //imag
- tbl v16.16b, {v28.16b}, v16.16b
- tbl v17.16b, {v29.16b}, v17.16b
- tbl v18.16b, {v30.16b}, v18.16b
- tbl v19.16b, {v31.16b}, v19.16b
-
- fadd v6.4s, v6.4s, v20.4s
- fadd v22.4s, v22.4s, v23.4s
- fadd v7.4s, v7.4s, v16.4s
- fadd v18.4s, v18.4s, v19.4s
-
- fadd v21.4s, v21.4s, v22.4s
- fadd v17.4s, v17.4s, v18.4s
- fadd v6.4s, v6.4s, v21.4s
- fadd v7.4s, v7.4s, v17.4s
-
- ret
-endfunc
-
-function fft15_neon
- mov x8, x1
- mov x9, x30
- add x2, x3, x3, lsl #1 // 3 * stride
-
- add x1, x8, x3, lsl #3 // in + 1 * stride
- bl fft5_neon
- mov v1.8b, v0.8b
- mov v2.16b, v6.16b
- mov v3.16b, v7.16b
-
- add x1, x8, x3, lsl #4 // in + 2 * stride
- add x2, x3, x3, lsl #1 // 3 * stride
- bl fft5_neon
- zip1 v1.4s, v1.4s, v0.4s
- mov v4.16b, v6.16b
- mov v5.16b, v7.16b
-
- mov x1, x8 // in + 0 * stride
- add x2, x3, x3, lsl #1 // 3 * stride
- bl fft5_neon
-
- faddp v20.4s, v1.4s, v1.4s
-
- ext v18.16b, v8.16b, v8.16b, #4
- ext v19.16b, v9.16b, v9.16b, #4
- mov v16.16b, v6.16b
- mov v17.16b, v7.16b
- fadd v20.2s, v20.2s, v0.2s
-
- uzp1 v18.4s, v18.4s, v10.4s // exp[2,4,6,8].re
- uzp1 v19.4s, v19.4s, v11.4s // exp[2,4,6,8].im
-
- st1 {v20.2s}, [x0], #8 // out[0]
-
- fmla v16.4s, v2.4s, v8.4s
- fmls v16.4s, v3.4s, v9.4s
-
- fmla v17.4s, v2.4s, v9.4s
- fmla v17.4s, v3.4s, v8.4s
-
- fmla v16.4s, v4.4s, v18.4s
- fmls v16.4s, v5.4s, v19.4s
-
- fmla v17.4s, v4.4s, v19.4s
- fmla v17.4s, v5.4s, v18.4s
-
- zip1 v18.4s, v16.4s, v17.4s
- zip2 v19.4s, v16.4s, v17.4s
-
- rev64 v31.4s, v14.4s
- trn1 v28.2d, v1.2d, v1.2d
- trn2 v29.2d, v1.2d, v1.2d
- zip1 v30.2d, v14.2d, v31.2d
- zip2 v31.2d, v14.2d, v31.2d
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[1-4]
-
- fmul v16.4s, v28.4s, v30.4s
- fmul v17.4s, v29.4s, v30.4s
- fmls v16.4s, v29.4s, v31.4s
- fmla v17.4s, v28.4s, v31.4s
- faddp v16.4s, v16.4s, v16.4s
- faddp v17.4s, v17.4s, v17.4s
- zip1 v18.2s, v16.2s, v17.2s
- zip2 v19.2s, v16.2s, v17.2s
-
- fadd v18.2s, v18.2s, v0.2s
- fadd v0.2s, v19.2s, v0.2s
-
- ext v30.16b, v12.16b, v12.16b, #4
- ext v31.16b, v13.16b, v13.16b, #4
- mov v16.16b, v6.16b
- mov v17.16b, v7.16b
-
- uzp1 v30.4s, v30.4s, v8.4s
- uzp1 v31.4s, v31.4s, v9.4s
-
- st1 {v18.2s}, [x0], #8 // out[5]
-
- fmla v16.4s, v2.4s, v10.4s
- fmls v16.4s, v3.4s, v11.4s
-
- fmla v17.4s, v2.4s, v11.4s
- fmla v17.4s, v3.4s, v10.4s
-
- fmla v16.4s, v4.4s, v30.4s
- fmls v16.4s, v5.4s, v31.4s
-
- fmla v17.4s, v4.4s, v31.4s
- fmla v17.4s, v5.4s, v30.4s
-
- zip1 v18.4s, v16.4s, v17.4s
- zip2 v19.4s, v16.4s, v17.4s
-
- ext v30.16b, v10.16b, v10.16b, #4
- ext v31.16b, v11.16b, v11.16b, #4
-
- fmla v6.4s, v2.4s, v12.4s
- fmls v6.4s, v3.4s, v13.4s
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[6-9]
-
- uzp1 v30.4s, v30.4s, v12.4s
- uzp1 v31.4s, v31.4s, v13.4s
-
- fmla v7.4s, v2.4s, v13.4s
- fmla v7.4s, v3.4s, v12.4s
-
- st1 {v0.2s}, [x0], #8 // out[10]
-
- fmla v6.4s, v4.4s, v30.4s
- fmls v6.4s, v5.4s, v31.4s
-
- fmla v7.4s, v4.4s, v31.4s
- fmla v7.4s, v5.4s, v30.4s
-
- zip1 v18.4s, v6.4s, v7.4s
- zip2 v19.4s, v6.4s, v7.4s
-
- st1 {v18.4s,v19.4s}, [x0], #32 // out[11-14]
-
- ret x9
-endfunc
-
-// x0: out, x1: out+len2, x2: exptab, x3: len2
-function fft15_pass
- ands x6, x3, #3
- mov x4, x0
- mov x5, x1
- b.eq 9f
- ld1 {v0.2s}, [x0], #8
- ld1 {v1.2s}, [x1], #8
- sub x3, x3, x6
- subs x6, x6, #1
- fadd v2.2s, v0.2s, v1.2s
- fsub v3.2s, v0.2s, v1.2s
- add x2, x2, #8
- st1 {v2.2s}, [x4], #8
- st1 {v3.2s}, [x5], #8
- b.eq 9f
-1:
- subs x6, x6, #1
- ldp s4, s5, [x2], #8
- ldp s2, s3, [x1], #8
- ldp s0, s1, [x0], #8
-
- fmul s6, s2, s4
- fmul s7, s2, s5
- fmls s6, s3, v5.s[0]
- fmla s7, s3, v4.s[0]
-
- fsub s2, s0, s6
- fsub s3, s1, s7
- fadd s0, s0, s6
- fadd s1, s1, s7
-
- stp s2, s3, [x5], #8
- stp s0, s1, [x4], #8
- b.gt 1b
-9:
- ld1 {v4.4s,v5.4s}, [x2], #32
- ld2 {v2.4s,v3.4s}, [x1], #32
- uzp1 v6.4s, v4.4s, v5.4s
- uzp2 v7.4s, v4.4s, v5.4s
- ld2 {v0.4s,v1.4s}, [x0], #32
-8:
- subs x3, x3, #8
-
- fmul v4.4s, v2.4s, v6.4s
- fmul v5.4s, v2.4s, v7.4s
- b.lt 4f
-
- ld1 {v18.4s,v19.4s}, [x2], #32
-
- fmls v4.4s, v3.4s, v7.4s
- fmla v5.4s, v3.4s, v6.4s
-
- ld2 {v22.4s,v23.4s}, [x1], #32
-
- fsub v2.4s, v0.4s, v4.4s
- fadd v0.4s, v0.4s, v4.4s
- fsub v3.4s, v1.4s, v5.4s
- fadd v1.4s, v1.4s, v5.4s
-
- uzp1 v16.4s, v18.4s, v19.4s
- uzp2 v17.4s, v18.4s, v19.4s
-
- st2 {v2.4s,v3.4s}, [x5], #32
- st2 {v0.4s,v1.4s}, [x4], #32
- ld2 {v20.4s,v21.4s}, [x0], #32
-
- fmul v18.4s, v22.4s, v16.4s
- fmul v19.4s, v22.4s, v17.4s
- b.eq 0f
-
- ld1 {v4.4s,v5.4s}, [x2], #32
-
- fmls v18.4s, v23.4s, v17.4s
- fmla v19.4s, v23.4s, v16.4s
-
- ld2 {v2.4s,v3.4s}, [x1], #32
-
- fsub v22.4s, v20.4s, v18.4s
- fadd v20.4s, v20.4s, v18.4s
- fsub v23.4s, v21.4s, v19.4s
- fadd v21.4s, v21.4s, v19.4s
-
- uzp1 v6.4s, v4.4s, v5.4s
- uzp2 v7.4s, v4.4s, v5.4s
-
- st2 {v22.4s,v23.4s}, [x5], #32
- st2 {v20.4s,v21.4s}, [x4], #32
- ld2 {v0.4s,v1.4s}, [x0], #32
-
- b 8b
-4:
- fmls v4.4s, v3.4s, v7.4s
- fmla v5.4s, v3.4s, v6.4s
-
- fsub v2.4s, v0.4s, v4.4s
- fadd v0.4s, v0.4s, v4.4s
- fsub v3.4s, v1.4s, v5.4s
- fadd v1.4s, v1.4s, v5.4s
-
- st2 {v2.4s,v3.4s}, [x5], #32
- st2 {v0.4s,v1.4s}, [x4], #32
-
- ret
-0:
- fmls v18.4s, v23.4s, v17.4s
- fmla v19.4s, v23.4s, v16.4s
-
- fsub v22.4s, v20.4s, v18.4s
- fadd v20.4s, v20.4s, v18.4s
- fsub v23.4s, v21.4s, v19.4s
- fadd v21.4s, v21.4s, v19.4s
-
- st2 {v22.4s,v23.4s}, [x5], #32
- st2 {v20.4s,v21.4s}, [x4], #32
-
- ret
-endfunc
-
-function fft30_neon, align=6
- sub sp, sp, #0x20
- stp x20, x21, [sp]
- stp x22, x30, [sp, #0x10]
- mov x21, x1
- mov x22, x2
- mov x20, x4
- mov x0, x21
- mov x1, x22
- lsl x3, x20, #1
- bl fft15_neon
-
- add x0, x21, #15*8
- add x1, x22, x20, lsl #3
- lsl x3, x20, #1
- bl fft15_neon
-
- ldr x2, [x10, #(CELT_EXPTAB + 8)] // s->exptab[1]
- add x0, x21, #0
- add x1, x21, #15*8
- mov x3, #15
- ldp x20, x21, [sp]
- ldp x22, x30, [sp, #0x10]
- add sp, sp, #0x20
- b fft15_pass
-endfunc
-
-.macro def_fft n, n2
-function fft\n\()_neon, align=6
- sub sp, sp, #0x30
- stp x20, x21, [sp]
- stp x22, x30, [sp, #0x10]
- stp x23, x24, [sp, #0x20]
- mov x21, x1
- mov x22, x2
- mov x23, x3
- mov x20, x4
- sub x3, x3, #1
- lsl x4, x4, #1
- bl fft\n2\()_neon
-
- add x1, x21, #(\n2 * 8)
- add x2, x22, x20, lsl #3
- sub x3, x23, #1
- lsl x4, x20, #1
- bl fft\n2\()_neon
-
- add x5, x10, #CELT_EXPTAB
- mov x0, x21
- ldr x2, [x5, x23, lsl #3] // s->exptab[N]
- add x1, x21, #(\n2 * 8)
- mov x3, #\n2
- ldp x20, x21, [sp]
- ldp x22, x30, [sp, #0x10]
- ldp x23, x24, [sp, #0x20]
- add sp, sp, #0x30
- b fft15_pass
-endfunc
-.endm
-
- def_fft 60, 30
- def_fft 120, 60
- def_fft 240, 120
- def_fft 480, 240
- def_fft 960, 480
-
-function fft_b15_calc_neon
- sub sp, sp, #0x50
- ldr x8, [x0, #CELT_EXPTAB] // s->exptab[0]
- movrel x6, fact5
- movrel x11, shuffle_0213
- movrel x12, shuffle_1032
- movrel x13, shuffle_2301
- movrel x14, shuffle_3120
- add x8, x8, #8
- movrel x5, fft_tab_neon
- stp x20, x30, [sp]
- stp d8, d9, [sp, #0x10]
- stp d10, d11, [sp, #0x20]
- stp d12, d13, [sp, #0x30]
- stp d14, d15, [sp, #0x40]
- ld1 {v15.4s}, [x6]
- ld1 {v0.4s,v1.4s}, [x8], #32
- ld1 {v6.2s}, [x8], #8
- ld1 {v2.4s,v3.4s}, [x8], #32
- ld1 {v7.2s}, [x8], #8
- ld1 {v4.4s,v5.4s}, [x8], #32
- uzp1 v8.4s, v0.4s, v1.4s // exp[ 1 - 4].re
- uzp2 v9.4s, v0.4s, v1.4s // exp[ 1 - 4].im
- uzp1 v10.4s, v2.4s, v3.4s // exp[ 6 - 9].re
- uzp2 v11.4s, v2.4s, v3.4s // exp[ 6 - 9].im
- uzp1 v12.4s, v4.4s, v5.4s // exp[11 - 14].re
- uzp2 v13.4s, v4.4s, v5.4s // exp[11 - 14].im
- zip1 v14.4s, v6.4s, v7.4s // exp[5,10].re/exp[5,10].im
- add x5, x5, x3, lsl #3
- ldr x5, [x5]
- mov x10, x0
- blr x5
- ldp x20, x30, [sp]
- ldp d8, d9, [sp, #0x10]
- ldp d10, d11, [sp, #0x20]
- ldp d12, d13, [sp, #0x30]
- ldp d14, d15, [sp, #0x40]
- add sp, sp, #0x50
- ret
-endfunc
-
-const fft_tab_neon, relocate=1
- .quad fft15_neon
- .quad fft30_neon
- .quad fft60_neon
- .quad fft120_neon
- .quad fft240_neon
- .quad fft480_neon
- .quad fft960_neon
-endconst
-
-function ff_celt_imdct_half_neon, export=1
- sub sp, sp, #0x20
- stp x21, x30, [sp]
- str s0, [sp, #0x10]
-
- ldp w5, w6, [x0, #CELT_LEN2] // CELT_LEN4
- mov x10, x0
- mov x21, x1
- sub w5, w5, #1
- lsl x7, x3, #3 // 2 * stride * sizeof(float)
- sub x8, xzr, x3, lsl #3 // -2 * stride * sizeof(float)
- mul x5, x5, x3
- ldp x9, x10, [x0, #CELT_TMP] // CELT_TWIDDLE
- ldr w3, [x0, #CELT_FFT_N]
- add x5, x2, x5, lsl #2
- mov x11, x9
-
- sub w6, w6, #4
- ld1 {v0.s}[0], [x5], x8
- ld1 {v1.s}[0], [x2], x7
- ld1 {v4.4s,v5.4s}, [x10], #32
- ld1 {v0.s}[1], [x5], x8
- ld1 {v1.s}[1], [x2], x7
- uzp1 v2.4s, v4.4s, v5.4s
- ld1 {v0.s}[2], [x5], x8
- ld1 {v1.s}[2], [x2], x7
- uzp2 v3.4s, v4.4s, v5.4s
- ld1 {v0.s}[3], [x5], x8
- ld1 {v1.s}[3], [x2], x7
-1:
- subs w6, w6, #4
-
- ld1 {v20.s}[0], [x5], x8
- ld1 {v21.s}[0], [x2], x7
- ld1 {v4.4s,v5.4s}, [x10], #32
-
- fmul v6.4s, v0.4s, v2.4s
- fmul v7.4s, v0.4s, v3.4s
-
- ld1 {v20.s}[1], [x5], x8
- ld1 {v21.s}[1], [x2], x7
-
- fmls v6.4s, v1.4s, v3.4s
- fmla v7.4s, v1.4s, v2.4s
-
- ld1 {v20.s}[2], [x5], x8
- ld1 {v21.s}[2], [x2], x7
-
- uzp1 v2.4s, v4.4s, v5.4s
- uzp2 v3.4s, v4.4s, v5.4s
- ld1 {v20.s}[3], [x5], x8
- ld1 {v21.s}[3], [x2], x7
-
- zip1 v4.4s, v6.4s, v7.4s
- zip2 v5.4s, v6.4s, v7.4s
-
- fmul v6.4s, v20.4s, v2.4s
- fmul v7.4s, v20.4s, v3.4s
-
- st1 {v4.4s,v5.4s}, [x9], #32
-
- fmls v6.4s, v21.4s, v3.4s
- fmla v7.4s, v21.4s, v2.4s
-
- b.eq 3f
-
- subs w6, w6, #4
- ld1 {v4.4s,v5.4s}, [x10], #32
- ld1 {v0.s}[0], [x5], x8
- ld1 {v1.s}[0], [x2], x7
- uzp1 v2.4s, v4.4s, v5.4s
- ld1 {v0.s}[1], [x5], x8
- ld1 {v1.s}[1], [x2], x7
- uzp2 v3.4s, v4.4s, v5.4s
- ld1 {v0.s}[2], [x5], x8
- ld1 {v1.s}[2], [x2], x7
- zip1 v4.4s, v6.4s, v7.4s
- zip2 v5.4s, v6.4s, v7.4s
- ld1 {v0.s}[3], [x5], x8
- ld1 {v1.s}[3], [x2], x7
-
- st1 {v4.4s,v5.4s}, [x9], #32
-
- b.gt 1b
-
- fmul v6.4s, v0.4s, v2.4s
- fmul v7.4s, v0.4s, v3.4s
- fmls v6.4s, v1.4s, v3.4s
- fmla v7.4s, v1.4s, v2.4s
-3:
- zip1 v4.4s, v6.4s, v7.4s
- zip2 v5.4s, v6.4s, v7.4s
- st1 {v4.4s,v5.4s}, [x9], #32
-
- mov x2, x11
- mov x4, #1
-
- bl fft_b15_calc_neon
-
- ldr w5, [x10, #CELT_LEN4]
- ldr x6, [x10, #CELT_TWIDDLE]
- ldr s31, [sp, #0x10]
-
- add x1, x21, x5, lsl #2
- add x3, x6, x5, lsl #2
- sub x0, x1, #16
- sub x2, x3, #16
- mov x8, #-16
- mov x7, #16
- mov x10, x0
- mov x11, x1
-
- sub w5, w5, #4
-
- ld1 {v0.4s}, [x0], x8
- ld1 {v1.4s}, [x1], x7
- ld1 {v2.4s}, [x2], x8
- ld1 {v3.4s}, [x3], x7
-
- uzp1 v4.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].re
- uzp2 v6.4s, v0.4s, v1.4s // z[-i-2, -i-1, +i, i+1].im
-
- uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
- uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
- fmul v1.4s, v6.4s, v5.4s
- fmul v0.4s, v6.4s, v7.4s
-2:
- subs w5, w5, #4
-
- ld1 {v20.4s}, [x0], x8
-
- fmla v1.4s, v4.4s, v7.4s
- fmls v0.4s, v4.4s, v5.4s
-
- ld1 {v21.4s}, [x1], x7
-
- ext v1.16b, v1.16b, v1.16b, #8
- fmul v0.4s, v0.4s, v31.s[0]
-
- ld1 {v2.4s}, [x2], x8
-
- rev64 v1.4s, v1.4s
- fmul v1.4s, v1.4s, v31.s[0]
-
- ld1 {v3.4s}, [x3], x7
-
- zip1 v5.4s, v0.4s, v1.4s
- zip2 v7.4s, v0.4s, v1.4s
-
- uzp1 v4.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].re
- uzp2 v6.4s, v20.4s, v21.4s // z[-i-2, -i-1, +i, i+1].im
-
- st1 {v5.4s}, [x10], x8
- st1 {v7.4s}, [x11], x7
-
- uzp1 v5.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].re
- uzp2 v7.4s, v2.4s, v3.4s // twiddle_exptab[-i-2, -i-1, +i, i+1].im
-
- fmul v1.4s, v6.4s, v5.4s
- fmul v0.4s, v6.4s, v7.4s
- b.gt 2b
-
- fmla v1.4s, v4.4s, v7.4s
- fmls v0.4s, v4.4s, v5.4s
- ext v1.16b, v1.16b, v1.16b, #8
- fmul v0.4s, v0.4s, v31.s[0]
- rev64 v1.4s, v1.4s
- fmul v1.4s, v1.4s, v31.s[0]
- zip1 v5.4s, v0.4s, v1.4s
- zip2 v7.4s, v0.4s, v1.4s
- st1 {v5.4s}, [x10], x8
- st1 {v7.4s}, [x11], x7
-
- ldp x21, x30, [sp]
- add sp, sp, #0x20
- ret
-endfunc
-
-// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
-const fact5, align=4
- .float 0.30901699437494745, 0.95105651629515353
- .float -0.80901699437494734, 0.58778525229247325
-endconst
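As the comment above notes, fact5 holds exp(2iπ/5) and exp(2iπ·2/5), the twiddle factors of the radix-5 stage of the 15-point FFT being removed here. A minimal standalone C check (hypothetical, not part of either tree) that reproduces the four .float literals:

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* k = 1, 2 -> the two rows of the fact5 table */
        for (int k = 1; k <= 2; k++) {
            double complex w = cexp(2.0 * I * M_PI * k / 5.0);
            printf("%.17f, %.17f\n", creal(w), cimag(w));
        }
        return 0;
    }

Built with -lm this prints 0.30901699..., 0.95105651... and -0.80901699..., 0.58778525..., matching the table.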
diff --git a/libavcodec/aarch64/mdct_init.c b/libavcodec/aarch64/mdct_init.c
deleted file mode 100644
index 816111ab63..0000000000
--- a/libavcodec/aarch64/mdct_init.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/aarch64/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_aarch64(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_neon;
- s->imdct_half = ff_imdct_half_neon;
- s->mdct_calc = ff_mdct_calc_neon;
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- }
-}
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index bccd8323fd..1fd199c972 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/mpegaudiodsp_init.c b/libavcodec/aarch64/mpegaudiodsp_init.c
index 849e310f62..5d966af5f4 100644
--- a/libavcodec/aarch64/mpegaudiodsp_init.c
+++ b/libavcodec/aarch64/mpegaudiodsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index 2a36f67603..b6ef131228 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 377009e244..0fddbecae3 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/neontest.c b/libavcodec/aarch64/neontest.c
index 201bfb1ce7..a24c22dd30 100644
--- a/libavcodec/aarch64/neontest.c
+++ b/libavcodec/aarch64/neontest.c
@@ -2,20 +2,20 @@
* check NEON registers for clobbers
* Copyright (c) 2013 Martin Storsjo
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/rv40dsp_init_aarch64.c b/libavcodec/aarch64/rv40dsp_init_aarch64.c
index f7fcd5b493..142705db98 100644
--- a/libavcodec/aarch64/rv40dsp_init_aarch64.c
+++ b/libavcodec/aarch64/rv40dsp_init_aarch64.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/sbrdsp_init_aarch64.c b/libavcodec/aarch64/sbrdsp_init_aarch64.c
new file mode 100644
index 0000000000..9c967990df
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_init_aarch64.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/sbrdsp.h"
+
+void ff_sbr_sum64x5_neon(float *z);
+float ff_sbr_sum_square_neon(float (*x)[2], int n);
+void ff_sbr_neg_odd_64_neon(float *x);
+void ff_sbr_qmf_pre_shuffle_neon(float *z);
+void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
+void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
+ const float *g_filt, int m_max, intptr_t ixh);
+void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
+ const float alpha0[2], const float alpha1[2],
+ float bw, int start, int end);
+void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
+void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
+av_cold void ff_sbrdsp_init_aarch64(SBRDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->sum64x5 = ff_sbr_sum64x5_neon;
+ s->sum_square = ff_sbr_sum_square_neon;
+ s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
+ s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
+ s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
+ s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
+ s->hf_g_filt = ff_sbr_hf_g_filt_neon;
+ s->hf_gen = ff_sbr_hf_gen_neon;
+ s->autocorrelate = ff_sbr_autocorrelate_neon;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
+ }
+}
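For context, this init follows FFmpeg's usual per-architecture dispatch: a generic constructor assigns the scalar C defaults and then lets the architecture init overwrite individual pointers. A rough sketch of that call pattern (the generic function body is an assumption for illustration; only ff_sbrdsp_init_aarch64() above comes from this patch):

    /* Sketch only: assumes the generic float SBR DSP init calls into the
     * per-architecture init, as FFmpeg DSP contexts normally do. */
    av_cold void ff_sbrdsp_init(SBRDSPContext *s)
    {
        /* ... scalar C defaults assigned here ... */
    #if ARCH_AARCH64
        ff_sbrdsp_init_aarch64(s);  /* NEON overrides from the file above */
    #endif
    }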
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
new file mode 100644
index 0000000000..d23717e760
--- /dev/null
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -0,0 +1,327 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const factors, align=4
+ .float 1.0, -1.0, 1.0, -1.0
+endconst
+
+const phi_noise_0, align=4
+ .float 1.0, 0.0, 1.0, 0.0
+endconst
+
+const phi_noise_1, align=4
+ .float 0.0, 1.0, 0.0, -1.0
+ .float 0.0, -1.0, 0.0, 1.0
+endconst
+
+const phi_noise_2, align=4
+ .float -1.0, 0.0, -1.0, 0.0
+endconst
+
+const phi_noise_3, align=4
+ .float 0.0, -1.0, 0.0, 1.0
+ .float 0.0, 1.0, 0.0, -1.0
+endconst
+
+function ff_sbr_sum64x5_neon, export=1
+ add x1, x0, #64*4
+ add x2, x0, #128*4
+ add x3, x0, #192*4
+ add x4, x0, #256*4
+ mov x5, #64
+1: ld1 {v0.4S}, [x0]
+ ld1 {v1.4S}, [x1], #16
+ fadd v0.4S, v0.4S, v1.4S
+ ld1 {v2.4S}, [x2], #16
+ fadd v0.4S, v0.4S, v2.4S
+ ld1 {v3.4S}, [x3], #16
+ fadd v0.4S, v0.4S, v3.4S
+ ld1 {v4.4S}, [x4], #16
+ fadd v0.4S, v0.4S, v4.4S
+ st1 {v0.4S}, [x0], #16
+ subs x5, x5, #4
+ b.gt 1b
+ ret
+endfunc
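For readers cross-checking the NEON loop above: it accumulates five consecutive 64-float blocks of z in place, four floats per iteration. A scalar sketch of the same operation (derived from the assembly, not the project's reference C):

    static void sbr_sum64x5_ref(float *z)
    {
        for (int k = 0; k < 64; k++)
            z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
    }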
+
+function ff_sbr_sum_square_neon, export=1
+ movi v0.4S, #0
+1: ld1 {v1.4S}, [x0], #16
+ fmla v0.4S, v1.4S, v1.4S
+ subs w1, w1, #2
+ b.gt 1b
+ faddp v0.4S, v0.4S, v0.4S
+ faddp v0.4S, v0.4S, v0.4S
+ ret
+endfunc
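The result comes back in s0 after the two faddp reductions; n counts complex pairs, so each pass consumes four floats while decrementing w1 by 2. A scalar sketch of the computation:

    static float sbr_sum_square_ref(float (*x)[2], int n)
    {
        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
        return sum;
    }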
+
+function ff_sbr_neg_odd_64_neon, export=1
+ mov x1, x0
+ movi v5.4S, #1<<7, lsl #24
+ ld2 {v0.4S, v1.4S}, [x0], #32
+ eor v1.16B, v1.16B, v5.16B
+ ld2 {v2.4S, v3.4S}, [x0], #32
+.rept 3
+ st2 {v0.4S, v1.4S}, [x1], #32
+ eor v3.16B, v3.16B, v5.16B
+ ld2 {v0.4S, v1.4S}, [x0], #32
+ st2 {v2.4S, v3.4S}, [x1], #32
+ eor v1.16B, v1.16B, v5.16B
+ ld2 {v2.4S, v3.4S}, [x0], #32
+.endr
+ eor v3.16B, v3.16B, v5.16B
+ st2 {v0.4S, v1.4S}, [x1], #32
+ st2 {v2.4S, v3.4S}, [x1], #32
+ ret
+endfunc
+
+function ff_sbr_qmf_pre_shuffle_neon, export=1
+ add x1, x0, #60*4
+ add x2, x0, #64*4
+ mov x3, #-16
+ mov x4, #-4
+ movi v6.4S, #1<<7, lsl #24
+ ld1 {v0.2S}, [x0], #8
+ st1 {v0.2S}, [x2], #8
+.rept 7
+ ld1 {v1.4S}, [x1], x3
+ ld1 {v2.4S}, [x0], #16
+ eor v1.16B, v1.16B, v6.16B
+ rev64 v1.4S, v1.4S
+ ext v1.16B, v1.16B, v1.16B, #8
+ st2 {v1.4S, v2.4S}, [x2], #32
+.endr
+ add x1, x1, #8
+ ld1 {v1.2S}, [x1], x4
+ ld1 {v2.2S}, [x0], #8
+ ld1 {v1.S}[3], [x1]
+ ld1 {v2.S}[2], [x0]
+ eor v1.16B, v1.16B, v6.16B
+ rev64 v1.4S, v1.4S
+ st2 {v1.2S, v2.2S}, [x2], #16
+ st2 {v1.S, v2.S}[2], [x2]
+ ret
+endfunc
+
+function ff_sbr_qmf_post_shuffle_neon, export=1
+ add x2, x1, #60*4
+ mov x3, #-16
+ mov x4, #32
+ movi v6.4S, #1<<7, lsl #24
+1: ld1 {v0.4S}, [x2], x3
+ ld1 {v1.4S}, [x1], #16
+ eor v0.16B, v0.16B, v6.16B
+ rev64 v0.4S, v0.4S
+ ext v0.16B, v0.16B, v0.16B, #8
+ st2 {v0.4S, v1.4S}, [x0], #32
+ subs x4, x4, #4
+ b.gt 1b
+ ret
+endfunc
+
+function ff_sbr_qmf_deint_neg_neon, export=1
+ add x1, x1, #56*4
+ add x2, x0, #60*4
+ mov x3, #-32
+ mov x4, #32
+ movi v2.4S, #1<<7, lsl #24
+1: ld2 {v0.4S, v1.4S}, [x1], x3
+ eor v0.16B, v0.16B, v2.16B
+ rev64 v1.4S, v1.4S
+ ext v1.16B, v1.16B, v1.16B, #8
+ st1 {v0.4S}, [x2]
+ st1 {v1.4S}, [x0], #16
+ sub x2, x2, #16
+ subs x4, x4, #4
+ b.gt 1b
+ ret
+endfunc
+
+function ff_sbr_qmf_deint_bfly_neon, export=1
+ add x2, x2, #60*4
+ add x3, x0, #124*4
+ mov x4, #64
+ mov x5, #-16
+1: ld1 {v0.4S}, [x1], #16
+ ld1 {v1.4S}, [x2], x5
+ rev64 v2.4S, v0.4S
+ ext v2.16B, v2.16B, v2.16B, #8
+ rev64 v3.4S, v1.4S
+ ext v3.16B, v3.16B, v3.16B, #8
+ fadd v1.4S, v1.4S, v2.4S
+ fsub v0.4S, v0.4S, v3.4S
+ st1 {v0.4S}, [x0], #16
+ st1 {v1.4S}, [x3], x5
+ subs x4, x4, #4
+ b.gt 1b
+ ret
+endfunc
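The butterfly above walks src0 forwards and src1 backwards, writing the difference to the front of v and the reversed sum to its tail. A scalar sketch derived from that addressing (not the project's reference C):

    static void sbr_qmf_deint_bfly_ref(float *v, const float *src0, const float *src1)
    {
        for (int i = 0; i < 64; i++) {
            v[i]       = src0[i] - src1[63 - i];
            v[127 - i] = src0[i] + src1[63 - i];
        }
    }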
+
+function ff_sbr_hf_gen_neon, export=1
+ sxtw x4, w4
+ sxtw x5, w5
+ movrel x6, factors
+ ld1 {v7.4S}, [x6]
+ dup v1.4S, v0.S[0]
+ mov v2.8B, v1.8B
+ mov v2.S[2], v7.S[0]
+ mov v2.S[3], v7.S[0]
+ fmul v1.4S, v1.4S, v2.4S
+ ld1 {v0.D}[0], [x3]
+ ld1 {v0.D}[1], [x2]
+ fmul v0.4S, v0.4S, v1.4S
+ fmul v1.4S, v0.4S, v7.4S
+ rev64 v0.4S, v0.4S
+ sub x7, x5, x4
+ add x0, x0, x4, lsl #3
+ add x1, x1, x4, lsl #3
+ sub x1, x1, #16
+1: ld1 {v2.4S}, [x1], #16
+ ld1 {v3.2S}, [x1]
+ fmul v4.4S, v2.4S, v1.4S
+ fmul v5.4S, v2.4S, v0.4S
+ faddp v4.4S, v4.4S, v4.4S
+ faddp v5.4S, v5.4S, v5.4S
+ faddp v4.4S, v4.4S, v4.4S
+ faddp v5.4S, v5.4S, v5.4S
+ mov v4.S[1], v5.S[0]
+ fadd v4.2S, v4.2S, v3.2S
+ st1 {v4.2S}, [x0], #8
+ sub x1, x1, #8
+ subs x7, x7, #1
+ b.gt 1b
+ ret
+endfunc
+
+function ff_sbr_hf_g_filt_neon, export=1
+ sxtw x3, w3
+ sxtw x4, w4
+ mov x5, #40*2*4
+ add x1, x1, x4, lsl #3
+1: ld1 {v0.2S}, [x1], x5
+ ld1 {v1.S}[0], [x2], #4
+ fmul v2.4S, v0.4S, v1.S[0]
+ st1 {v2.2S}, [x0], #8
+ subs x3, x3, #1
+ b.gt 1b
+ ret
+endfunc
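Here x5 = 40*2*4 is the row stride of X_high in bytes and ixh selects the column, so each iteration scales one complex sample of column ixh by g_filt[m]. A scalar sketch matching the prototype declared in sbrdsp_init_aarch64.c:

    #include <stdint.h>

    static void sbr_hf_g_filt_ref(float (*Y)[2], const float (*X_high)[40][2],
                                  const float *g_filt, int m_max, intptr_t ixh)
    {
        for (int m = 0; m < m_max; m++) {
            Y[m][0] = X_high[m][ixh][0] * g_filt[m];
            Y[m][1] = X_high[m][ixh][1] * g_filt[m];
        }
    }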
+
+function ff_sbr_autocorrelate_neon, export=1
+ mov x2, #38
+ movrel x3, factors
+ ld1 {v0.4S}, [x3]
+ movi v1.4S, #0
+ movi v2.4S, #0
+ movi v3.4S, #0
+ ld1 {v4.2S}, [x0], #8
+ ld1 {v5.2S}, [x0], #8
+ fmul v16.2S, v4.2S, v4.2S
+ fmul v17.2S, v5.2S, v4.S[0]
+ fmul v18.2S, v5.2S, v4.S[1]
+1: ld1 {v5.D}[1], [x0], #8
+ fmla v1.2S, v4.2S, v4.2S
+ fmla v2.4S, v5.4S, v4.S[0]
+ fmla v3.4S, v5.4S, v4.S[1]
+ mov v4.D[0], v5.D[0]
+ mov v5.D[0], v5.D[1]
+ subs x2, x2, #1
+ b.gt 1b
+ fmul v19.2S, v4.2S, v4.2S
+ fmul v20.2S, v5.2S, v4.S[0]
+ fmul v21.2S, v5.2S, v4.S[1]
+ fadd v22.4S, v2.4S, v20.4S
+ fsub v22.4S, v22.4S, v17.4S
+ fadd v23.4S, v3.4S, v21.4S
+ fsub v23.4S, v23.4S, v18.4S
+ rev64 v23.4S, v23.4S
+ fmul v23.4S, v23.4S, v0.4S
+ fadd v22.4S, v22.4S, v23.4S
+ st1 {v22.4S}, [x1], #16
+ fadd v23.2S, v1.2S, v19.2S
+ fsub v23.2S, v23.2S, v16.2S
+ faddp v23.2S, v23.2S, v23.2S
+ st1 {v23.S}[0], [x1]
+ add x1, x1, #8
+ rev64 v3.2S, v3.2S
+ fmul v3.2S, v3.2S, v0.2S
+ fadd v2.2S, v2.2S, v3.2S
+ st1 {v2.2S}, [x1]
+ add x1, x1, #16
+ faddp v1.2S, v1.2S, v1.2S
+ st1 {v1.S}[0], [x1]
+ ret
+endfunc
+
+.macro apply_noise_common
+ sxtw x3, w3
+ sxtw x5, w5
+ movrel x7, X(ff_sbr_noise_table)
+ add x3, x3, #1
+1: and x3, x3, #0x1ff
+ add x8, x7, x3, lsl #3
+ add x3, x3, #2
+ ld1 {v2.4S}, [x0]
+ ld1 {v3.2S}, [x1], #8
+ ld1 {v4.2S}, [x2], #8
+ ld1 {v5.4S}, [x8]
+ mov v6.16B, v2.16B
+ zip1 v3.4S, v3.4S, v3.4S
+ zip1 v4.4S, v4.4S, v4.4S
+ fmla v6.4S, v1.4S, v3.4S
+ fmla v2.4S, v5.4S, v4.4S
+ fcmeq v7.4S, v3.4S, #0
+ bif v2.16B, v6.16B, v7.16B
+ st1 {v2.4S}, [x0], #16
+ subs x5, x5, #2
+ b.gt 1b
+.endm
+
+function ff_sbr_hf_apply_noise_0_neon, export=1
+ movrel x9, phi_noise_0
+ ld1 {v1.4S}, [x9]
+ apply_noise_common
+ ret
+endfunc
+
+function ff_sbr_hf_apply_noise_1_neon, export=1
+ movrel x9, phi_noise_1
+ and x4, x4, #1
+ add x9, x9, x4, lsl #4
+ ld1 {v1.4S}, [x9]
+ apply_noise_common
+ ret
+endfunc
+
+function ff_sbr_hf_apply_noise_2_neon, export=1
+ movrel x9, phi_noise_2
+ ld1 {v1.4S}, [x9]
+ apply_noise_common
+ ret
+endfunc
+
+function ff_sbr_hf_apply_noise_3_neon, export=1
+ movrel x9, phi_noise_3
+ and x4, x4, #1
+ add x9, x9, x4, lsl #4
+ ld1 {v1.4S}, [x9]
+ apply_noise_common
+ ret
+endfunc
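The apply_noise_common macro processes two bins per pass: v1 holds the per-variant phase pattern (phi_noise_0..3), and fcmeq/bif select, per bin, between the sinusoid term s_m[m]*phi and the noise term q_filt[m] times an entry of ff_sbr_noise_table. A hedged scalar sketch of that selection; the table argument and its indexing are assumptions made to keep the example self-contained, not code lifted from the C reference:

    /* noise_table stands in for ff_sbr_noise_table (512 complex entries);
     * phi[4] is one row of the phi_noise_* tables above, covering two bins. */
    static void sbr_hf_apply_noise_ref(float Y[][2], const float *s_m,
                                       const float *q_filt, int noise,
                                       const float phi[4], int m_max,
                                       const float noise_table[][2])
    {
        for (int m = 0; m < m_max; m++) {
            const float *p = phi + 2 * (m & 1);
            noise = (noise + 1) & 0x1ff;
            if (s_m[m]) {                       /* bif keeps the s_m path */
                Y[m][0] += s_m[m] * p[0];
                Y[m][1] += s_m[m] * p[1];
            } else {                            /* bif keeps the noise path */
                Y[m][0] += q_filt[m] * noise_table[noise][0];
                Y[m][1] += q_filt[m] * noise_table[noise][1];
            }
        }
    }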
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
new file mode 100644
index 0000000000..5e4d021a97
--- /dev/null
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define z1 v0.H[0]
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const idct_coeff_neon, align=4
+ .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
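The Z constants are the usual simple-IDCT cosine table in Q14: each per-define comment repeats the formula with a literal i, where i is in fact the constant's index, i.e. Z<i> = round(cos(i*π/16)*√2*2^14) for i = 1..7. A standalone check (hypothetical, not part of the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        for (int i = 1; i <= 7; i++)    /* Z1 .. Z7 */
            printf("Z%d = %d\n", i,
                   (int)(cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14) + 0.5));
        return 0;
    }

This prints 22725, 21407, 19266, 16384, 12873, 8867, 4520; the header keeps Z4 at 16383, one below the exact value, presumably to stay strictly under 2^14.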
+
+.macro idct_start data
+ prfm pldl1keep, [\data]
+ mov x10, x30
+ movrel x3, idct_coeff_neon
+ ld1 {v0.2D}, [x3]
+.endm
+
+.macro idct_end
+ br x10
+.endm
+
+.macro smull1 a, b, c
+ smull \a, \b, \c
+.endm
+
+.macro smlal1 a, b, c
+ smlal \a, \b, \c
+.endm
+
+.macro smlsl1 a, b, c
+ smlsl \a, \b, \c
+.endm
+
+.macro idct_col4_top y1, y2, y3, y4, i, l
+ smull\i v7.4S, \y3\l, z2
+ smull\i v16.4S, \y3\l, z6
+ smull\i v17.4S, \y2\l, z1
+ add v19.4S, v23.4S, v7.4S
+ smull\i v18.4S, \y2\l, z3
+ add v20.4S, v23.4S, v16.4S
+ smull\i v5.4S, \y2\l, z5
+ sub v21.4S, v23.4S, v16.4S
+ smull\i v6.4S, \y2\l, z7
+ sub v22.4S, v23.4S, v7.4S
+
+ smlal\i v17.4S, \y4\l, z3
+ smlsl\i v18.4S, \y4\l, z7
+ smlsl\i v5.4S, \y4\l, z1
+ smlsl\i v6.4S, \y4\l, z5
+.endm
+
+.macro idct_row4_neon y1, y2, y3, y4, pass
+ ld1 {\y1\().2D,\y2\().2D}, [x2], #32
+ movi v23.4S, #1<<2, lsl #8
+ orr v5.16B, \y1\().16B, \y2\().16B
+ ld1 {\y3\().2D,\y4\().2D}, [x2], #32
+ orr v6.16B, \y3\().16B, \y4\().16B
+ orr v5.16B, v5.16B, v6.16B
+ mov x3, v5.D[1]
+ smlal v23.4S, \y1\().4H, z4
+
+ idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
+
+ cmp x3, #0
+ b.eq \pass\()f
+
+ smull2 v7.4S, \y1\().8H, z4
+ smlal2 v17.4S, \y2\().8H, z5
+ smlsl2 v18.4S, \y2\().8H, z1
+ smull2 v16.4S, \y3\().8H, z2
+ smlal2 v5.4S, \y2\().8H, z7
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+ smlal2 v6.4S, \y2\().8H, z3
+ smull2 v7.4S, \y3\().8H, z6
+ smlal2 v17.4S, \y4\().8H, z7
+ smlsl2 v18.4S, \y4\().8H, z5
+ smlal2 v5.4S, \y4\().8H, z3
+ smlsl2 v6.4S, \y4\().8H, z1
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+ sub v22.4S, v22.4S, v7.4S
+
+\pass: add \y3\().4S, v19.4S, v17.4S
+ add \y4\().4S, v20.4S, v18.4S
+ shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
+ shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
+ add v7.4S, v21.4S, v5.4S
+ add v16.4S, v22.4S, v6.4S
+ shrn \y3\().4H, v7.4S, #ROW_SHIFT
+ shrn \y4\().4H, v16.4S, #ROW_SHIFT
+ sub v22.4S, v22.4S, v6.4S
+ sub v19.4S, v19.4S, v17.4S
+ sub v21.4S, v21.4S, v5.4S
+ shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
+ sub v20.4S, v20.4S, v18.4S
+ shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
+ shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
+ shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
+
+ trn1 v16.8H, \y1\().8H, \y2\().8H
+ trn2 v17.8H, \y1\().8H, \y2\().8H
+ trn1 v18.8H, \y3\().8H, \y4\().8H
+ trn2 v19.8H, \y3\().8H, \y4\().8H
+ trn1 \y1\().4S, v16.4S, v18.4S
+ trn1 \y2\().4S, v17.4S, v19.4S
+ trn2 \y3\().4S, v16.4S, v18.4S
+ trn2 \y4\().4S, v17.4S, v19.4S
+.endm
+
+.macro declare_idct_col4_neon i, l
+function idct_col4_neon\i
+ dup v23.4H, z4c
+.if \i == 1
+ add v23.4H, v23.4H, v24.4H
+.else
+ mov v5.D[0], v24.D[1]
+ add v23.4H, v23.4H, v5.4H
+.endif
+ smull v23.4S, v23.4H, z4
+
+ idct_col4_top v24, v25, v26, v27, \i, \l
+
+ mov x4, v28.D[\i - 1]
+ mov x5, v29.D[\i - 1]
+ cmp x4, #0
+ b.eq 1f
+
+ smull\i v7.4S, v28\l, z4
+ add v19.4S, v19.4S, v7.4S
+ sub v20.4S, v20.4S, v7.4S
+ sub v21.4S, v21.4S, v7.4S
+ add v22.4S, v22.4S, v7.4S
+
+1: mov x4, v30.D[\i - 1]
+ cmp x5, #0
+ b.eq 2f
+
+ smlal\i v17.4S, v29\l, z5
+ smlsl\i v18.4S, v29\l, z1
+ smlal\i v5.4S, v29\l, z7
+ smlal\i v6.4S, v29\l, z3
+
+2: mov x5, v31.D[\i - 1]
+ cmp x4, #0
+ b.eq 3f
+
+ smull\i v7.4S, v30\l, z6
+ smull\i v16.4S, v30\l, z2
+ add v19.4S, v19.4S, v7.4S
+ sub v22.4S, v22.4S, v7.4S
+ sub v20.4S, v20.4S, v16.4S
+ add v21.4S, v21.4S, v16.4S
+
+3: cmp x5, #0
+ b.eq 4f
+
+ smlal\i v17.4S, v31\l, z7
+ smlsl\i v18.4S, v31\l, z5
+ smlal\i v5.4S, v31\l, z3
+ smlsl\i v6.4S, v31\l, z1
+
+4: addhn v7.4H, v19.4S, v17.4S
+ addhn2 v7.8H, v20.4S, v18.4S
+ subhn v18.4H, v20.4S, v18.4S
+ subhn2 v18.8H, v19.4S, v17.4S
+
+ addhn v16.4H, v21.4S, v5.4S
+ addhn2 v16.8H, v22.4S, v6.4S
+ subhn v17.4H, v22.4S, v6.4S
+ subhn2 v17.8H, v21.4S, v5.4S
+
+ ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1, .4H
+declare_idct_col4_neon 2, .8H
+
+function ff_simple_idct_put_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ bl idct_col4_neon1
+
+ sqshrun v1.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v3.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sqshrun v2.8B, v7.8H, #COL_SHIFT-16
+ sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
+ sqshrun v4.8B, v17.8H, #COL_SHIFT-16
+ sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
+
+ zip1 v16.4S, v1.4S, v2.4S
+ zip2 v17.4S, v1.4S, v2.4S
+
+ st1 {v16.D}[0], [x0], x1
+ st1 {v16.D}[1], [x0], x1
+
+ zip1 v18.4S, v3.4S, v4.4S
+ zip2 v19.4S, v3.4S, v4.4S
+
+ st1 {v17.D}[0], [x0], x1
+ st1 {v17.D}[1], [x0], x1
+ st1 {v18.D}[0], [x0], x1
+ st1 {v18.D}[1], [x0], x1
+ st1 {v19.D}[0], [x0], x1
+ st1 {v19.D}[1], [x0], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1
+ idct_start x2
+
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ bl idct_col4_neon1
+
+ sshr v1.8H, v7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, v7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ mov x9, x0
+ ld1 {v19.D}[0], [x0], x1
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ ld1 {v19.D}[1], [x0], x1
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ ld1 {v20.D}[0], [x0], x1
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ ld1 {v20.D}[1], [x0], x1
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ ld1 {v21.D}[0], [x0], x1
+ uaddw v23.8H, v23.8H, v19.8B
+ uaddw2 v24.8H, v24.8H, v19.16B
+ ld1 {v21.D}[1], [x0], x1
+ sqxtun v23.8B, v23.8H
+ sqxtun2 v23.16B, v24.8H
+ ld1 {v22.D}[0], [x0], x1
+ uaddw v24.8H, v25.8H, v20.8B
+ uaddw2 v25.8H, v26.8H, v20.16B
+ ld1 {v22.D}[1], [x0], x1
+ sqxtun v24.8B, v24.8H
+ sqxtun2 v24.16B, v25.8H
+ st1 {v23.D}[0], [x9], x1
+ uaddw v25.8H, v27.8H, v21.8B
+ uaddw2 v26.8H, v28.8H, v21.16B
+ st1 {v23.D}[1], [x9], x1
+ sqxtun v25.8B, v25.8H
+ sqxtun2 v25.16B, v26.8H
+ st1 {v24.D}[0], [x9], x1
+ uaddw v26.8H, v29.8H, v22.8B
+ uaddw2 v27.8H, v30.8H, v22.16B
+ st1 {v24.D}[1], [x9], x1
+ sqxtun v26.8B, v26.8H
+ sqxtun2 v26.16B, v27.8H
+ st1 {v25.D}[0], [x9], x1
+ st1 {v25.D}[1], [x9], x1
+ st1 {v26.D}[0], [x9], x1
+ st1 {v26.D}[1], [x9], x1
+
+ idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1
+ idct_start x0
+
+ mov x2, x0
+ idct_row4_neon v24, v25, v26, v27, 1
+ idct_row4_neon v28, v29, v30, v31, 2
+ sub x2, x2, #128
+ bl idct_col4_neon1
+
+ sshr v1.8H, v7.8H, #COL_SHIFT-16
+ sshr v2.8H, v16.8H, #COL_SHIFT-16
+ sshr v3.8H, v17.8H, #COL_SHIFT-16
+ sshr v4.8H, v18.8H, #COL_SHIFT-16
+
+ bl idct_col4_neon2
+
+ sshr v7.8H, v7.8H, #COL_SHIFT-16
+ sshr v16.8H, v16.8H, #COL_SHIFT-16
+ sshr v17.8H, v17.8H, #COL_SHIFT-16
+ sshr v18.8H, v18.8H, #COL_SHIFT-16
+
+ zip1 v23.2D, v1.2D, v7.2D
+ zip2 v24.2D, v1.2D, v7.2D
+ st1 {v23.2D,v24.2D}, [x2], #32
+ zip1 v25.2D, v2.2D, v16.2D
+ zip2 v26.2D, v2.2D, v16.2D
+ st1 {v25.2D,v26.2D}, [x2], #32
+ zip1 v27.2D, v3.2D, v17.2D
+ zip2 v28.2D, v3.2D, v17.2D
+ st1 {v27.2D,v28.2D}, [x2], #32
+ zip1 v29.2D, v4.2D, v18.2D
+ zip2 v30.2D, v4.2D, v18.2D
+ st1 {v29.2D,v30.2D}, [x2], #32
+
+ idct_end
+endfunc
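A note on the fixed-point scaling shared by the three entry points: the row pass narrows with shrn #ROW_SHIFT (11), and in the column pass addhn/subhn already drop the low 16 bits of the 32-bit accumulators, so the extra shift of COL_SHIFT-16 brings the total column descale to COL_SHIFT (20). In ff_simple_idct_put_neon the last step also saturates to unsigned 8-bit (sqshrun). A scalar illustration of that final step, under the stated assumptions rather than as code from the patch:

    #include <stdint.h>

    /* col_acc is one 32-bit column accumulator; truncating shifts as in
     * addhn (>> 16) followed by sqshrun #(COL_SHIFT - 16). */
    static uint8_t idct_put_descale(int32_t col_acc)
    {
        int32_t v = (col_acc >> 16) >> (20 - 16);   /* == col_acc >> COL_SHIFT */
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }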
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/synth_filter_init.c
index d3430d045c..767b01112a 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/synth_filter_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,8 +23,8 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
-#include "libavcodec/dcadsp.h"
#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
#include "asm-offsets.h"
@@ -32,25 +32,12 @@
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index b001c737da..8fcd71f252 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index ab97a97740..13dfd74940 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index 7ce5a7ddf6..24067cc2af 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/videodsp_init.c b/libavcodec/aarch64/videodsp_init.c
index 59b697d4f4..6f667a6d3e 100644
--- a/libavcodec/aarch64/videodsp_init.c
+++ b/libavcodec/aarch64/videodsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vorbisdsp_init.c b/libavcodec/aarch64/vorbisdsp_init.c
index 3559b54a30..c796f95e61 100644
--- a/libavcodec/aarch64/vorbisdsp_init.c
+++ b/libavcodec/aarch64/vorbisdsp_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vorbisdsp_neon.S b/libavcodec/aarch64/vorbisdsp_neon.S
index 11f71f1d89..e76feebc54 100644
--- a/libavcodec/aarch64/vorbisdsp_neon.S
+++ b/libavcodec/aarch64/vorbisdsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h
index 616252ebc7..871fed7a95 100644
--- a/libavcodec/aarch64/vp8dsp.h
+++ b/libavcodec/aarch64/vp8dsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c
index 723afb4afd..fc7e831d17 100644
--- a/libavcodec/aarch64/vp8dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 604be8a8bf..aefe8fdcde 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -6,20 +6,20 @@
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
* Copyright (c) 2019 Martin Storsjo <martin@martin.st>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp9dsp_init.h b/libavcodec/aarch64/vp9dsp_init.h
new file mode 100644
index 0000000000..9df1752c62
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
+#define AVCODEC_AARCH64_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
diff --git a/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
new file mode 100644
index 0000000000..0fa0d7f8c2
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
new file mode 100644
index 0000000000..dae2232403
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
diff --git a/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
new file mode 100644
index 0000000000..8dcfdeaaf7
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix) \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp) \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp) \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
+ temp + 3 * 2 * sz, 2 * sz, \
+ h, 0, my); \
+}
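To make the macro concrete: each expansion glues an "hv" function together from the existing one-dimensional NEON filters, running the horizontal pass into a stack buffer and the vertical pass out of it (filtering h + 8 rows, as the comment explains). In the 10 bpp build, define_8tap_2d_fn(put, regular, 8, 10) expands to roughly the following (shown for illustration only):

    static void put_regular8_hv_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                        const uint8_t *src, ptrdiff_t src_stride,
                                        int h, int mx, int my)
    {
        LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (8 < 64)) * 8 + 8) * 8 * 2]);
        ff_vp9_put_regular8_h_10_neon(temp, 2 * 8, src - 3 * src_stride,
                                      src_stride, h + 8, mx, 0);
        ff_vp9_put_regular8_v_10_neon(dst, dst_stride, temp + 3 * 2 * 8, 2 * 8,
                                      h, 0, my);
    }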
+
+#define decl_filter_funcs(op, dir, sz, bpp) \
+ decl_mc_func(op, regular, dir, sz, bpp); \
+ decl_mc_func(op, sharp, dir, sz, bpp); \
+ decl_mc_func(op, smooth, dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp) \
+ decl_filter_funcs(put, h, sz, bpp); \
+ decl_filter_funcs(avg, h, sz, bpp); \
+ decl_filter_funcs(put, v, sz, bpp); \
+ decl_filter_funcs(avg, v, sz, bpp); \
+ decl_filter_funcs(put, hv, sz, bpp); \
+ decl_filter_funcs(avg, hv, sz, bpp)
+
+#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
+#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64, );
+declare_fpel(copy, 32, );
+declare_fpel(copy, 16, );
+declare_fpel(copy, 8, );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8, _16);
+declare_fpel(avg, 4, _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp) \
+ define_8tap_2d_fn(put, regular, sz, bpp) \
+ define_8tap_2d_fn(put, sharp, sz, bpp) \
+ define_8tap_2d_fn(put, smooth, sz, bpp) \
+ define_8tap_2d_fn(avg, regular, sz, bpp) \
+ define_8tap_2d_fn(avg, sharp, sz, bpp) \
+ define_8tap_2d_fn(avg, smooth, sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8, BPP)
+define_8tap_2d_funcs(4, BPP)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
+
+#define init_copy(idx, sz, suffix) \
+ init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+ init_fpel(idx, 1, sz, avg, suffix)
+
+#define init_copy_avg(idx, sz1, sz2) \
+ init_copy(idx, sz2, _neon); \
+ init_avg (idx, sz1, _16_neon)
+
+ if (have_armv8(cpu_flags)) {
+ init_copy(0, 128, _aarch64);
+ init_copy(1, 64, _aarch64);
+ init_copy(2, 32, _aarch64);
+ }
+
+ if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp) \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
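+
+/* For instance, init_mc_funcs_dirs(0, 64, BPP) with BPP == 10 sets
+ * dsp->mc[0][FILTER_8TAP_REGULAR][0][0][1] = ff_vp9_put_regular64_v_10_neon,
+ * dsp->mc[0][FILTER_8TAP_REGULAR][0][1][0] = ff_vp9_put_regular64_h_10_neon and
+ * dsp->mc[0][FILTER_8TAP_REGULAR][0][1][1] = put_regular64_hv_10_neon (the 2D
+ * wrapper defined above), plus the matching sharp/smooth and avg entries. */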
+
+
+ init_avg(0, 64, _16_neon);
+ init_avg(1, 32, _16_neon);
+ init_avg(2, 16, _16_neon);
+ init_copy_avg(3, 8, 16);
+ init_copy_avg(4, 4, 8);
+
+ init_mc_funcs_dirs(0, 64, BPP);
+ init_mc_funcs_dirs(1, 32, BPP);
+ init_mc_funcs_dirs(2, 16, BPP);
+ init_mc_funcs_dirs(3, 8, BPP);
+ init_mc_funcs_dirs(4, 4, BPP);
+ }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp) \
+ define_itxfm(idct, idct, sz, bpp); \
+ define_itxfm(iadst, idct, sz, bpp); \
+ define_itxfm(idct, iadst, sz, bpp); \
+ define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4, BPP);
+define_itxfm_funcs(8, BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4, BPP);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
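+
+/* e.g. init_itxfm(TX_4X4, 4x4, BPP) with BPP == 12 resolves to
+ * dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_12_neon and so on,
+ * while init_idct(4, iwht_iwht_4x4, BPP) fills the extra lossless (WHT) slot
+ * at index 4 with the same function for all four transform type entries. */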
+
+ init_itxfm(TX_4X4, 4x4, BPP);
+ init_itxfm(TX_8X8, 8x8, BPP);
+ init_itxfm(TX_16X16, 16x16, BPP);
+ init_idct(TX_32X32, idct_idct_32x32, BPP);
+ init_idct(4, iwht_iwht_4x4, BPP);
+ }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+ define_loop_filter(h, wd, size, bpp); \
+ define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4, 8, BPP);
+define_loop_filters(8, 8, BPP);
+define_loop_filters(16, 8, BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
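+
+/* The two-digit widths belong to the mix2 filters, which filter two adjacent
+ * 8 pixel edges in one call (hence the 16 pixel size); the first digit is the
+ * filter width for the first edge and the second digit for the second one,
+ * e.g. 48 = one 4-wide and one 8-wide filter. */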
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+ dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+ init_lpf_func_8(idx, 0, h, wd, bpp); \
+ init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp) \
+ init_lpf_func_16(0, h, bpp); \
+ init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+ init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
+ init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp) \
+ init_lpf_funcs_8_wd(0, 4, bpp); \
+ init_lpf_funcs_8_wd(1, 8, bpp); \
+ init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp) \
+ init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+ init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+ init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+ init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+ init_lpf_funcs_8(BPP);
+ init_lpf_funcs_16(BPP);
+ init_lpf_funcs_mix2(BPP);
+ }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+ vp9dsp_mc_init_aarch64(dsp);
+ vp9dsp_loopfilter_init_aarch64(dsp);
+ vp9dsp_itxfm_init_aarch64(dsp);
+}
diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
index 3ce2c1b2b9..4c699759fe 100644
--- a/libavcodec/aarch64/vp9dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -1,28 +1,30 @@
/*
* Copyright (c) 2016 Google Inc.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
-#include "libavcodec/vp9.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
#define declare_fpel(type, sz) \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -239,8 +241,17 @@ static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
}
}
-av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp)
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_aarch64(dsp);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_aarch64(dsp);
+ return;
+ } else if (bpp != 8)
+ return;
+
vp9dsp_mc_init_aarch64(dsp);
vp9dsp_loopfilter_init_aarch64(dsp);
vp9dsp_itxfm_init_aarch64(dsp);
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000000..68296d9c40
--- /dev/null
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,2017 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
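+
+// The idct/iadst tables above hold the usual VP9 Q14 constants (the idct ones
+// are round(16384 * cos(k*pi/64)) for various k, e.g. 11585 = 16384/sqrt(2)
+// and 15137 = round(16384 * cos(pi/8))); products with them are rounded back
+// to the input scale with (x + (1 << 13)) >> 14, i.e. the #14 shifts below.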
+
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+ trn1 \r4\().4s, \r0\().4s, \r1\().4s
+ trn2 \r5\().4s, \r0\().4s, \r1\().4s
+ trn1 \r6\().4s, \r2\().4s, \r3\().4s
+ trn2 \r7\().4s, \r2\().4s, \r3\().4s
+ trn1 \r0\().2d, \r4\().2d, \r6\().2d
+ trn2 \r2\().2d, \r4\().2d, \r6\().2d
+ trn1 \r1\().2d, \r5\().2d, \r7\().2d
+ trn2 \r3\().2d, \r5\().2d, \r7\().2d
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
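+// Row i of the matrix is held in registers r(2*i) (columns 0-3) and
+// r(2*i+1) (columns 4-7).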
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+ transpose_4x4s \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
+ transpose_4x4s \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+ // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+ // while swapping the two 4x4 matrices between each other
+
+ // First step of the 4x4 transpose of r1-r7, into t0-t3
+ trn1 \t0\().4s, \r1\().4s, \r3\().4s
+ trn2 \t1\().4s, \r1\().4s, \r3\().4s
+ trn1 \t2\().4s, \r5\().4s, \r7\().4s
+ trn2 \t3\().4s, \r5\().4s, \r7\().4s
+
+ // First step of the 4x4 transpose of r8-r14, into r1-r7
+ trn1 \r1\().4s, \r8\().4s, \r10\().4s
+ trn2 \r3\().4s, \r8\().4s, \r10\().4s
+ trn1 \r5\().4s, \r12\().4s, \r14\().4s
+ trn2 \r7\().4s, \r12\().4s, \r14\().4s
+
+ // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+ trn1 \r8\().2d, \t0\().2d, \t2\().2d
+ trn2 \r12\().2d, \t0\().2d, \t2\().2d
+ trn1 \r10\().2d, \t1\().2d, \t3\().2d
+ trn2 \r14\().2d, \t1\().2d, \t3\().2d
+
+ // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+ trn1 \t0\().2d, \r1\().2d, \r5\().2d
+ trn2 \r5\().2d, \r1\().2d, \r5\().2d
+ trn1 \t1\().2d, \r3\().2d, \r7\().2d
+ trn2 \r7\().2d, \r3\().2d, \r7\().2d
+
+ // Move the outputs of trn1 back in place
+ mov \r1\().16b, \t0\().16b
+ mov \r3\().16b, \t1\().16b
+.endm
+
+// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can be done with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
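+// In C terms this is roughly, per 32 bit lane (v0.s[0] is 11585, i.e.
+// cos(pi/4) in Q14, wherever this macro is used in this file):
+//   out1 = (int32_t)(((int64_t)(in1 + in2) * 11585 + (1 << 13)) >> 14);
+//   out2 = (int32_t)(((int64_t)(in1 - in2) * 11585 + (1 << 13)) >> 14);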
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+ neg \tmp4\().4s, v0.4s
+.endif
+ add \tmp1\().4s, \in1\().4s, \in2\().4s
+ sub \tmp2\().4s, \in1\().4s, \in2\().4s
+.if \neg > 0
+ smull \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+ smull2 \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
+.else
+ smull \tmp3\().2d, \tmp1\().2s, v0.s[0]
+ smull2 \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+ rshrn \out1\().2s, \tmp3\().2d, #14
+ rshrn2 \out1\().4s, \tmp4\().2d, #14
+ smull \tmp3\().2d, \tmp2\().2s, v0.s[0]
+ smull2 \tmp4\().2d, \tmp2\().4s, v0.s[0]
+ rshrn \out2\().2s, \tmp3\().2d, #14
+ rshrn2 \out2\().4s, \tmp4\().2d, #14
+.else
+ smull \tmp5\().2d, \tmp2\().2s, v0.s[0]
+ smull2 \tmp6\().2d, \tmp2\().4s, v0.s[0]
+ rshrn \out1\().2s, \tmp3\().2d, #14
+ rshrn2 \out1\().4s, \tmp4\().2d, #14
+ rshrn \out2\().2s, \tmp5\().2d, #14
+ rshrn2 \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+ smull \tmp1\().2d, \in1\().2s, v0.s[0]
+ smull2 \tmp2\().2d, \in1\().4s, v0.s[0]
+ rshrn \out1\().2s, \tmp1\().2d, #14
+ rshrn2 \out1\().4s, \tmp2\().2d, #14
+ rshrn \out2\().2s, \tmp1\().2d, #14
+ rshrn2 \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+ smull \out1\().2d, \in1\().2s, \coef1
+ smull2 \out2\().2d, \in1\().4s, \coef1
+ smull \out3\().2d, \in1\().2s, \coef2
+ smull2 \out4\().2d, \in1\().4s, \coef2
+ smlsl \out1\().2d, \in2\().2s, \coef2
+ smlsl2 \out2\().2d, \in2\().4s, \coef2
+ smlal \out3\().2d, \in2\().2s, \coef1
+ smlal2 \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+ neg \tmp3\().2d, \tmp3\().2d
+ neg \tmp4\().2d, \tmp4\().2d
+.endif
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().2d, \inout1\().2s, \coef1
+ smull2 \tmp2\().2d, \inout1\().4s, \coef1
+ smull \tmp3\().2d, \inout1\().2s, \coef2
+ smull2 \tmp4\().2d, \inout1\().4s, \coef2
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().2d, \inout2\().2s, \coef2
+ smull2 \tmp2\().2d, \inout2\().4s, \coef2
+ smull \tmp3\().2d, \inout2\().2s, \coef1
+ smull2 \tmp4\().2d, \inout2\().4s, \coef1
+ neg \tmp1\().2d, \tmp1\().2d
+ neg \tmp2\().2d, \tmp2\().2d
+ rshrn \inout2\().2s, \tmp3\().2d, #14
+ rshrn2 \inout2\().4s, \tmp4\().2d, #14
+ rshrn \inout1\().2s, \tmp1\().2d, #14
+ rshrn2 \inout1\().4s, \tmp2\().2d, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+ smull \out1\().2d, \in\().2s, \coef
+ smull2 \out2\().2d, \in\().4s, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+ rshrn \out\().2s, \in1\().2d, \shift
+ rshrn2 \out\().4s, \in2\().2d, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_4s out1, out2, in1, in2
+ add \out1\().4s, \in1\().4s, \in2\().4s
+ sub \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_4s_r out1, out2, in1, in2
+ sub \out1\().4s, \in1\().4s, \in2\().4s
+ add \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .4s registers, in are 4 x .2d registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ add \tmp1\().2d, \in1\().2d, \in3\().2d
+ add \tmp2\().2d, \in2\().2d, \in4\().2d
+ sub \tmp3\().2d, \in1\().2d, \in3\().2d
+ sub \tmp4\().2d, \in2\().2d, \in4\().2d
+ rshrn \out1\().2s, \tmp1\().2d, #14
+ rshrn2 \out1\().4s, \tmp2\().2d, #14
+ rshrn \out2\().2s, \tmp3\().2d, #14
+ rshrn2 \out2\().4s, \tmp4\().2d, #14
+.endm
+
+.macro iwht4_10 c0, c1, c2, c3
+ add \c0\().4s, \c0\().4s, \c1\().4s
+ sub v17.4s, \c2\().4s, \c3\().4s
+ sub v16.4s, \c0\().4s, v17.4s
+ sshr v16.4s, v16.4s, #1
+ sub \c2\().4s, v16.4s, \c1\().4s
+ sub \c1\().4s, v16.4s, \c3\().4s
+ add \c3\().4s, v17.4s, \c2\().4s
+ sub \c0\().4s, \c0\().4s, \c1\().4s
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3
+ iwht4_10 \c0, \c1, \c2, \c3
+.endm
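+
+// The WHT uses only additions, subtractions and shifts, so the 10 bit version
+// can be reused as-is at 12 bit. The idct4/iadst4 pairs below differ: at
+// 10 bit the products still fit in 32 bits (plain mul/mla/mls), while the
+// 12 bit variants widen to 64 bit intermediates with smull/smlal/smlsl.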
+
+.macro idct4_10 c0, c1, c2, c3
+ mul v22.4s, \c1\().4s, v0.s[3]
+ mul v20.4s, \c1\().4s, v0.s[2]
+ add v16.4s, \c0\().4s, \c2\().4s
+ sub v17.4s, \c0\().4s, \c2\().4s
+ mla v22.4s, \c3\().4s, v0.s[2]
+ mul v18.4s, v16.4s, v0.s[0]
+ mul v24.4s, v17.4s, v0.s[0]
+ mls v20.4s, \c3\().4s, v0.s[3]
+ srshr v22.4s, v22.4s, #14
+ srshr v18.4s, v18.4s, #14
+ srshr v24.4s, v24.4s, #14
+ srshr v20.4s, v20.4s, #14
+ add \c0\().4s, v18.4s, v22.4s
+ sub \c3\().4s, v18.4s, v22.4s
+ add \c1\().4s, v24.4s, v20.4s
+ sub \c2\().4s, v24.4s, v20.4s
+.endm
+
+.macro idct4_12 c0, c1, c2, c3
+ smull v22.2d, \c1\().2s, v0.s[3]
+ smull2 v23.2d, \c1\().4s, v0.s[3]
+ smull v20.2d, \c1\().2s, v0.s[2]
+ smull2 v21.2d, \c1\().4s, v0.s[2]
+ add v16.4s, \c0\().4s, \c2\().4s
+ sub v17.4s, \c0\().4s, \c2\().4s
+ smlal v22.2d, \c3\().2s, v0.s[2]
+ smlal2 v23.2d, \c3\().4s, v0.s[2]
+ smull v18.2d, v16.2s, v0.s[0]
+ smull2 v19.2d, v16.4s, v0.s[0]
+ smull v24.2d, v17.2s, v0.s[0]
+ smull2 v25.2d, v17.4s, v0.s[0]
+ smlsl v20.2d, \c3\().2s, v0.s[3]
+ smlsl2 v21.2d, \c3\().4s, v0.s[3]
+ rshrn v22.2s, v22.2d, #14
+ rshrn2 v22.4s, v23.2d, #14
+ rshrn v18.2s, v18.2d, #14
+ rshrn2 v18.4s, v19.2d, #14
+ rshrn v24.2s, v24.2d, #14
+ rshrn2 v24.4s, v25.2d, #14
+ rshrn v20.2s, v20.2d, #14
+ rshrn2 v20.4s, v21.2d, #14
+ add \c0\().4s, v18.4s, v22.4s
+ sub \c3\().4s, v18.4s, v22.4s
+ add \c1\().4s, v24.4s, v20.4s
+ sub \c2\().4s, v24.4s, v20.4s
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3
+ mul v16.4s, \c0\().4s, v1.s[0]
+ mla v16.4s, \c2\().4s, v1.s[1]
+ mla v16.4s, \c3\().4s, v1.s[2]
+ mul v18.4s, \c0\().4s, v1.s[2]
+ mls v18.4s, \c2\().4s, v1.s[0]
+ sub \c0\().4s, \c0\().4s, \c2\().4s
+ mls v18.4s, \c3\().4s, v1.s[1]
+ add \c0\().4s, \c0\().4s, \c3\().4s
+ mul v22.4s, \c1\().4s, v1.s[3]
+ mul v20.4s, \c0\().4s, v1.s[3]
+ add v24.4s, v16.4s, v22.4s
+ add v26.4s, v18.4s, v22.4s
+ srshr \c0\().4s, v24.4s, #14
+ add v16.4s, v16.4s, v18.4s
+ srshr \c1\().4s, v26.4s, #14
+ sub v16.4s, v16.4s, v22.4s
+ srshr \c2\().4s, v20.4s, #14
+ srshr \c3\().4s, v16.4s, #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3
+ smull v16.2d, \c0\().2s, v1.s[0]
+ smull2 v17.2d, \c0\().4s, v1.s[0]
+ smlal v16.2d, \c2\().2s, v1.s[1]
+ smlal2 v17.2d, \c2\().4s, v1.s[1]
+ smlal v16.2d, \c3\().2s, v1.s[2]
+ smlal2 v17.2d, \c3\().4s, v1.s[2]
+ smull v18.2d, \c0\().2s, v1.s[2]
+ smull2 v19.2d, \c0\().4s, v1.s[2]
+ smlsl v18.2d, \c2\().2s, v1.s[0]
+ smlsl2 v19.2d, \c2\().4s, v1.s[0]
+ sub \c0\().4s, \c0\().4s, \c2\().4s
+ smlsl v18.2d, \c3\().2s, v1.s[1]
+ smlsl2 v19.2d, \c3\().4s, v1.s[1]
+ add \c0\().4s, \c0\().4s, \c3\().4s
+ smull v22.2d, \c1\().2s, v1.s[3]
+ smull2 v23.2d, \c1\().4s, v1.s[3]
+ smull v20.2d, \c0\().2s, v1.s[3]
+ smull2 v21.2d, \c0\().4s, v1.s[3]
+ add v24.2d, v16.2d, v22.2d
+ add v25.2d, v17.2d, v23.2d
+ add v26.2d, v18.2d, v22.2d
+ add v27.2d, v19.2d, v23.2d
+ rshrn \c0\().2s, v24.2d, #14
+ rshrn2 \c0\().4s, v25.2d, #14
+ add v16.2d, v16.2d, v18.2d
+ add v17.2d, v17.2d, v19.2d
+ rshrn \c1\().2s, v26.2d, #14
+ rshrn2 \c1\().4s, v27.2d, #14
+ sub v16.2d, v16.2d, v22.2d
+ sub v17.2d, v17.2d, v23.2d
+ rshrn \c2\().2s, v20.2d, #14
+ rshrn2 \c2\().4s, v21.2d, #14
+ rshrn \c3\().2s, v16.2d, #14
+ rshrn2 \c3\().4s, v17.2d, #14
+.endm
+
+// The public functions in this file have the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
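+// For these 10/12 bpp versions, dst actually points to 16 bit pixels and
+// block to 32 bit coefficients (note the .4s loads from x2 and the .4h/.8h
+// pixel loads/stores below); the prototype is the bitdepth-agnostic vp9dsp
+// signature.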
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+.endif
+.ifc \txfm1,iadst
+ movrel x4, iadst4_coeffs
+ ld1 {v0.d}[1], [x4]
+ sxtl2 v1.4s, v0.8h
+.endif
+.else
+ movrel x4, itxfm4_coeffs
+ ld1 {v0.8h}, [x4]
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+
+ movi v30.4s, #0
+ movi v31.4s, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.ne 1f
+ // DC-only for idct/idct
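+ // With eob == 1 only the DC coefficient is set; each 1D pass then reduces
+ // to a rounded scale by 11585/16384, so the value is simply scaled twice,
+ // broadcast to all 16 outputs and added to dst after the usual #4 shift.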
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v31.s}[0], [x2]
+ dup v4.4s, v2.s[0]
+ mov v5.16b, v4.16b
+ mov v6.16b, v4.16b
+ mov v7.16b, v4.16b
+ b 2f
+.endif
+
+1:
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
+ st1 {v30.4s,v31.4s}, [x2], #32
+
+.ifc \txfm1,iwht
+ sshr v4.4s, v4.4s, #2
+ sshr v5.4s, v5.4s, #2
+ sshr v6.4s, v6.4s, #2
+ sshr v7.4s, v7.4s, #2
+.endif
+
+ \txfm1\()4_\bpp v4, v5, v6, v7
+
+ st1 {v30.4s,v31.4s}, [x2], #32
+ // Transpose 4x4 with 32 bit elements
+ transpose_4x4s v4, v5, v6, v7, v16, v17, v18, v19
+
+ \txfm2\()4_\bpp v4, v5, v6, v7
+2:
+ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ ld1 {v0.4h}, [x0], x1
+ ld1 {v1.4h}, [x0], x1
+.ifnc \txfm1,iwht
+ srshr v4.4s, v4.4s, #4
+ srshr v5.4s, v5.4s, #4
+ srshr v6.4s, v6.4s, #4
+ srshr v7.4s, v7.4s, #4
+.endif
+ uaddw v4.4s, v4.4s, v0.4h
+ uaddw v5.4s, v5.4s, v1.4h
+ ld1 {v2.4h}, [x0], x1
+ ld1 {v3.4h}, [x0], x1
+ sqxtun v0.4h, v4.4s
+ sqxtun2 v0.8h, v5.4s
+ sub x0, x0, x1, lsl #2
+
+ uaddw v6.4s, v6.4s, v2.4h
+ umin v0.8h, v0.8h, v31.8h
+ uaddw v7.4s, v7.4s, v3.4h
+ st1 {v0.4h}, [x0], x1
+ sqxtun v2.4h, v6.4s
+ sqxtun2 v2.8h, v7.4s
+ umin v2.8h, v2.8h, v31.8h
+
+ st1 {v0.d}[1], [x0], x1
+ st1 {v2.4h}, [x0], x1
+ st1 {v2.d}[1], [x0], x1
+
+ ret
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct, idct, \bpp
+itxfm_func4x4 iadst, idct, \bpp
+itxfm_func4x4 idct, iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht, iwht, \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+function idct8x8_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+
+ movi v1.4h, #0
+ sxtl v0.4s, v0.4h
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v2.4s, v2.4s, #5
+
+ mov x4, #8
+ mov x3, x0
+ dup v31.8h, w5
+1:
+ // Loop to add the constant from v2 into all 8x8 outputs
+ subs x4, x4, #2
+ ld1 {v3.8h}, [x0], x1
+ ld1 {v4.8h}, [x0], x1
+ uaddw v16.4s, v2.4s, v3.4h
+ uaddw2 v17.4s, v2.4s, v3.8h
+ uaddw v18.4s, v2.4s, v4.4h
+ uaddw2 v19.4s, v2.4s, v4.8h
+ sqxtun v3.4h, v16.4s
+ sqxtun2 v3.8h, v17.4s
+ sqxtun v4.4h, v18.4s
+ sqxtun2 v4.8h, v19.4s
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h}, [x3], x1
+ st1 {v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+ dmbutterfly0 \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
+ dmbutterfly \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3 // r2 = t2a, r6 = t3a
+ dmbutterfly \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3 // r1 = t4a, r7 = t7a
+ dmbutterfly \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3 // r5 = t5a, r3 = t6a
+
+ butterfly_4s \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
+ butterfly_4s \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
+ butterfly_4s \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
+ butterfly_4s \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
+
+ dmbutterfly0 \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
+
+ butterfly_4s \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
+ butterfly_4s \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
+ butterfly_4s \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
+ butterfly_4s \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
+.endm
+
+.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+ dmbutterfly_l \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a
+ dmbutterfly_l \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a
+
+ dbutterfly_n \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
+ dbutterfly_n \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
+
+ dmbutterfly_l \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a
+ dmbutterfly_l \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a
+
+ dbutterfly_n \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
+ dbutterfly_n \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
+
+ butterfly_4s \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3
+ neg \r7\().4s, \r7\().4s // r7 = out[7]
+ butterfly_4s \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2
+
+ dmbutterfly_l \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a
+ dmbutterfly_l \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a
+
+ dbutterfly_n \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7
+
+ dmbutterfly0 \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4]
+ neg \r3\().4s, \r3\().4s // r3 = out[3]
+
+ dbutterfly_n \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
+ neg \r1\().4s, \r1\().4s // r1 = out[1]
+
+ dmbutterfly0 \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5]
+ neg \r5\().4s, \r5\().4s // r5 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.eq idct8x8_dc_add_neon
+.endif
+ // The iadst also uses a few coefficients from
+ // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+ movrel x4, idct_coeffs
+.else
+ movrel x4, iadst8_coeffs
+ ld1 {v1.8h}, [x4], #16
+ stp d8, d9, [sp, #-0x10]!
+ sxtl2 v3.4s, v1.8h
+ sxtl v2.4s, v1.4h
+.endif
+ ld1 {v0.8h}, [x4]
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+
+ movi v4.4s, #0
+ movi v5.4s, #0
+ movi v6.4s, #0
+ movi v7.4s, #0
+
+1:
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
+ ld1 {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
+ ld1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ sub x2, x2, #256
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+ st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
+ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
+.else
+ \txfm1\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
+ \txfm1\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
+.endif
+
+ // Transpose 8x8 with 32 bit elements
+ transpose_8x8s v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ idct8 v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
+ idct8 v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
+.else
+ \txfm2\()8 v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
+ \txfm2\()8 v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
+.endif
+2:
+ mov x3, x0
+ // Add into the destination
+ ld1 {v0.8h}, [x0], x1
+ srshr v16.4s, v16.4s, #5
+ srshr v17.4s, v17.4s, #5
+ ld1 {v1.8h}, [x0], x1
+ srshr v18.4s, v18.4s, #5
+ srshr v19.4s, v19.4s, #5
+ ld1 {v2.8h}, [x0], x1
+ srshr v20.4s, v20.4s, #5
+ srshr v21.4s, v21.4s, #5
+ uaddw v16.4s, v16.4s, v0.4h
+ uaddw2 v17.4s, v17.4s, v0.8h
+ ld1 {v3.8h}, [x0], x1
+ srshr v22.4s, v22.4s, #5
+ srshr v23.4s, v23.4s, #5
+ uaddw v18.4s, v18.4s, v1.4h
+ uaddw2 v19.4s, v19.4s, v1.8h
+ ld1 {v4.8h}, [x0], x1
+ srshr v24.4s, v24.4s, #5
+ srshr v25.4s, v25.4s, #5
+ uaddw v20.4s, v20.4s, v2.4h
+ uaddw2 v21.4s, v21.4s, v2.8h
+ sqxtun v0.4h, v16.4s
+ sqxtun2 v0.8h, v17.4s
+ dup v16.8h, w5
+ ld1 {v5.8h}, [x0], x1
+ srshr v26.4s, v26.4s, #5
+ srshr v27.4s, v27.4s, #5
+ uaddw v22.4s, v22.4s, v3.4h
+ uaddw2 v23.4s, v23.4s, v3.8h
+ sqxtun v1.4h, v18.4s
+ sqxtun2 v1.8h, v19.4s
+ umin v0.8h, v0.8h, v16.8h
+ ld1 {v6.8h}, [x0], x1
+ srshr v28.4s, v28.4s, #5
+ srshr v29.4s, v29.4s, #5
+ uaddw v24.4s, v24.4s, v4.4h
+ uaddw2 v25.4s, v25.4s, v4.8h
+ sqxtun v2.4h, v20.4s
+ sqxtun2 v2.8h, v21.4s
+ umin v1.8h, v1.8h, v16.8h
+ ld1 {v7.8h}, [x0], x1
+ srshr v30.4s, v30.4s, #5
+ srshr v31.4s, v31.4s, #5
+ uaddw v26.4s, v26.4s, v5.4h
+ uaddw2 v27.4s, v27.4s, v5.8h
+ sqxtun v3.4h, v22.4s
+ sqxtun2 v3.8h, v23.4s
+ umin v2.8h, v2.8h, v16.8h
+
+ st1 {v0.8h}, [x3], x1
+ uaddw v28.4s, v28.4s, v6.4h
+ uaddw2 v29.4s, v29.4s, v6.8h
+ st1 {v1.8h}, [x3], x1
+ sqxtun v4.4h, v24.4s
+ sqxtun2 v4.8h, v25.4s
+ umin v3.8h, v3.8h, v16.8h
+ st1 {v2.8h}, [x3], x1
+ uaddw v30.4s, v30.4s, v7.4h
+ uaddw2 v31.4s, v31.4s, v7.8h
+ st1 {v3.8h}, [x3], x1
+ sqxtun v5.4h, v26.4s
+ sqxtun2 v5.8h, v27.4s
+ umin v4.8h, v4.8h, v16.8h
+ st1 {v4.8h}, [x3], x1
+ sqxtun v6.4h, v28.4s
+ sqxtun2 v6.8h, v29.4s
+ umin v5.8h, v5.8h, v16.8h
+ st1 {v5.8h}, [x3], x1
+ sqxtun v7.4h, v30.4s
+ sqxtun2 v7.8h, v31.4s
+ umin v6.8h, v6.8h, v16.8h
+
+ st1 {v6.8h}, [x3], x1
+ umin v7.8h, v7.8h, v16.8h
+ st1 {v7.8h}, [x3], x1
+
+.ifnc \txfm1\()_\txfm2,idct_idct
+ ldp d8, d9, [sp], 0x10
+.endif
+ ret
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+ mov x5, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+ mov x5, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+
+ movi v1.4h, #0
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v0.4s, v2.4s, #6
+
+ mov x3, x0
+ mov x4, #16
+ dup v31.8h, w13
+1:
+ // Loop to add the constant from v0 into all 16x16 outputs
+ subs x4, x4, #2
+ ld1 {v1.8h,v2.8h}, [x0], x1
+ uaddw v16.4s, v0.4s, v1.4h
+ uaddw2 v17.4s, v0.4s, v1.8h
+ ld1 {v3.8h,v4.8h}, [x0], x1
+ uaddw v18.4s, v0.4s, v2.4h
+ uaddw2 v19.4s, v0.4s, v2.8h
+ uaddw v20.4s, v0.4s, v3.4h
+ uaddw2 v21.4s, v0.4s, v3.8h
+ uaddw v22.4s, v0.4s, v4.4h
+ uaddw2 v23.4s, v0.4s, v4.8h
+ sqxtun v1.4h, v16.4s
+ sqxtun2 v1.8h, v17.4s
+ sqxtun v2.4h, v18.4s
+ sqxtun2 v2.8h, v19.4s
+ sqxtun v3.4h, v20.4s
+ sqxtun2 v3.8h, v21.4s
+ sqxtun v4.4h, v22.4s
+ sqxtun2 v4.8h, v23.4s
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h,v2.8h}, [x3], x1
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h,v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct16_end
+ butterfly_4s v18, v7, v4, v7 // v18 = t0a, v7 = t7a
+ butterfly_4s v19, v22, v5, v22 // v19 = t1a, v22 = t6
+ butterfly_4s v4, v26, v20, v26 // v4 = t2a, v26 = t5
+ butterfly_4s v5, v6, v28, v6 // v5 = t3a, v6 = t4
+ butterfly_4s v20, v28, v16, v24 // v20 = t8a, v28 = t11a
+ butterfly_4s v24, v21, v23, v21 // v24 = t9, v21 = t10
+ butterfly_4s v23, v27, v25, v27 // v23 = t14, v27 = t13
+ butterfly_4s v25, v29, v29, v17 // v25 = t15a, v29 = t12a
+
+ dmbutterfly0 v8, v9, v27, v21, v8, v9, v16, v17, v30, v31 // v8 = t13a, v9 = t10a
+ dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
+
+ butterfly_4s v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
+ butterfly_4s v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
+ butterfly_4s_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
+ butterfly_4s v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
+ butterfly_4s v18, v29, v4, v8 // v18 = out[2], v29 = out[13]
+ butterfly_4s v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
+ butterfly_4s v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
+ butterfly_4s v21, v26, v26, v9 // v21 = out[5], v26 = out[10]
+ ret
+.endm
+
+function idct16
+ dmbutterfly0 v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
+ dmbutterfly v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
+ dmbutterfly v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
+ dmbutterfly v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
+ dmbutterfly v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
+ dmbutterfly v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
+ dmbutterfly v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+ dmbutterfly v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ dmbutterfly0_h v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
+ dmbutterfly_h1 v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
+ dmbutterfly_h1 v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
+ dmbutterfly_h2 v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a, v22 = t6a
+ dmbutterfly_h1 v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a, v31 = t15a
+ dmbutterfly_h2 v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a, v23 = t14a
+ dmbutterfly_h1 v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+ dmbutterfly_h2 v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+ butterfly_4s v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_4s v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_4s v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_4s v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_4s v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_4s v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_4s v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_4s v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v8, v9, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ dsmull_h v24, v25, v19, v3.s[3]
+ dsmull_h v4, v5, v17, v2.s[0]
+ dsmull_h v7, v6, v18, v1.s[1]
+ dsmull_h v30, v31, v18, v1.s[0]
+ neg v24.2d, v24.2d
+ neg v25.2d, v25.2d
+ dsmull_h v29, v28, v17, v2.s[1]
+ dsmull_h v26, v27, v19, v3.s[2]
+ dsmull_h v22, v23, v16, v0.s[0]
+ drshrn_h v24, v24, v25, #14
+ drshrn_h v16, v4, v5, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v6, v30, v31, #14
+ drshrn_h v29, v29, v28, #14
+ drshrn_h v17, v26, v27, #14
+ drshrn_h v28, v22, v23, #14
+
+ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
+ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
+ neg v22.2d, v22.2d
+ neg v23.2d, v23.2d
+ drshrn_h v27, v20, v21, #14
+ drshrn_h v21, v22, v23, #14
+ drshrn_h v23, v18, v19, #14
+ drshrn_h v25, v30, v31, #14
+ mov v4.16b, v28.16b
+ mov v5.16b, v28.16b
+ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
+ mov v20.16b, v28.16b
+ idct16_end
+endfunc
+
+function iadst16
+ ld1 {v0.8h,v1.8h}, [x11]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+
+ dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.s[1], v0.s[0] // v6,v7 = t1, v4,v5 = t0
+ dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.s[1], v1.s[0] // v10,v11 = t9, v8,v9 = t8
+ dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
+ dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2] // v14,v15 = t3, v12,v13 = t2
+ dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
+
+ dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.s[3], v1.s[2] // v6,v7 = t11, v4,v5 = t10
+ dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
+ dmbutterfly_l v10, v11, v8, v9, v27, v20, v2.s[1], v2.s[0] // v10,v11 = t5, v8,v9 = t4
+ dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
+
+ dmbutterfly_l v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0] // v14,v15 = t13, v12,v13 = t12
+ dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
+ dmbutterfly_l v6, v7, v4, v5, v25, v22, v2.s[3], v2.s[2] // v6,v7 = t7, v4,v5 = t6
+ dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
+
+ dmbutterfly_l v10, v11, v8, v9, v17, v30, v3.s[3], v3.s[2] // v10,v11 = t15, v8,v9 = t14
+ ld1 {v0.8h}, [x10]
+ dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+ dmbutterfly_l v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1] // v14,v15 = t9, v12,v13 = t8
+ dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
+
+ dmbutterfly_l v4, v5, v6, v7, v28, v19, v1.s[1], v1.s[0] // v4,v5 = t12, v6,v7 = t13
+ dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
+ dmbutterfly_l v10, v11, v8, v9, v21, v26, v1.s[2], v1.s[3] // v10,v11 = t11, v8,v9 = t10
+ butterfly_4s_r v4, v27, v16, v27 // v4 = t4, v27 = t0
+ dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
+
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2] // v12,v13 = t14, v14,v15 = t15
+ butterfly_4s_r v5, v20, v31, v20 // v5 = t5, v20 = t1
+ dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
+ dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
+
+ butterfly_4s_r v6, v25, v18, v25 // v6 = t6, v25 = t2
+ butterfly_4s_r v7, v22, v29, v22 // v7 = t7, v22 = t3
+
+ dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.s[2], v0.s[3] // v10,v11 = t13, v8,v9 = t12
+ dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2] // v12,v13 = t14, v14,v15 = t15
+
+ dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
+ dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
+ neg v29.4s, v29.4s // v29 = out[13]
+
+ dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.s[2], v0.s[3] // v10,v11 = t5a, v8,v9 = t4a
+ dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.s[3], v0.s[2] // v12,v13 = t6a, v14,v15 = t7a
+
+ butterfly_4s v2, v6, v27, v25 // v2 = out[0], v6 = t2a
+ butterfly_4s v3, v7, v23, v21 // v3 =-out[1], v7 = t10
+
+ dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
+ neg v19.4s, v19.4s // v19 = out[3]
+ dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
+
+ butterfly_4s v5, v8, v20, v22 // v5 =-out[15],v8 = t3a
+ butterfly_4s v4, v9, v24, v26 // v4 = out[14],v9 = t11
+
+ dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+ dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+ dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
+ dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
+
+ neg v31.4s, v5.4s // v31 = out[15]
+ neg v17.4s, v3.4s // v17 = out[1]
+
+ mov v16.16b, v2.16b
+ mov v30.16b, v4.16b
+ ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
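+// For example, "load 16, x2, x9" expands to "ld1 {v16.4s}, [x2], x9", and
+// load_clear additionally stores the zeroed v4 back to clear the input.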
+.macro load i, src, inc
+ ld1 {v\i\().4s}, [\src], \inc
+.endm
+.macro store i, dst, inc
+ st1 {v\i\().4s}, [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+ movi v\i\()\size, \imm
+.endm
+.macro load_clear i, src, inc
+ ld1 {v\i\().4s}, [\src]
+ st1 {v4.4s}, [\src], \inc
+.endm
+
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
+ srshr \coef0, \coef0, #6
+ ld1 {v4.4h}, [x0], x1
+ srshr \coef1, \coef1, #6
+ ld1 {v4.d}[1], [x3], x1
+ srshr \coef2, \coef2, #6
+ ld1 {v5.4h}, [x0], x1
+ srshr \coef3, \coef3, #6
+ uaddw \coef0, \coef0, v4.4h
+ ld1 {v5.d}[1], [x3], x1
+ srshr \coef4, \coef4, #6
+ uaddw2 \coef1, \coef1, v4.8h
+ ld1 {v6.4h}, [x0], x1
+ srshr \coef5, \coef5, #6
+ uaddw \coef2, \coef2, v5.4h
+ ld1 {v6.d}[1], [x3], x1
+ sqxtun v4.4h, \coef0
+ srshr \coef6, \coef6, #6
+ uaddw2 \coef3, \coef3, v5.8h
+ ld1 {v7.4h}, [x0], x1
+ sqxtun2 v4.8h, \coef1
+ srshr \coef7, \coef7, #6
+ uaddw \coef4, \coef4, v6.4h
+ ld1 {v7.d}[1], [x3], x1
+ umin v4.8h, v4.8h, v8.8h
+ sub x0, x0, x1, lsl #2
+ sub x3, x3, x1, lsl #2
+ sqxtun v5.4h, \coef2
+ uaddw2 \coef5, \coef5, v6.8h
+ st1 {v4.4h}, [x0], x1
+ sqxtun2 v5.8h, \coef3
+ uaddw \coef6, \coef6, v7.4h
+ st1 {v4.d}[1], [x3], x1
+ umin v5.8h, v5.8h, v8.8h
+ sqxtun v6.4h, \coef4
+ uaddw2 \coef7, \coef7, v7.8h
+ st1 {v5.4h}, [x0], x1
+ sqxtun2 v6.8h, \coef5
+ st1 {v5.d}[1], [x3], x1
+ umin v6.8h, v6.8h, v8.8h
+ sqxtun v7.4h, \coef6
+ st1 {v6.4h}, [x0], x1
+ sqxtun2 v7.8h, \coef7
+ st1 {v6.d}[1], [x3], x1
+ umin v7.8h, v7.8h, v8.8h
+ st1 {v7.4h}, [x0], x1
+ st1 {v7.d}[1], [x3], x1
+.endm
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x4 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
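+// The temp buffer holds a full 16x16 block of 32 bit coefficients with a
+// 64 byte row stride (16*16*4 = 1024 bytes, matching the stack space
+// reserved by the 16x16 entry points below).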
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_4x16_pass1_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+
+ bl \txfm\()16
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ cmp x1, #12
+ b.eq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ store \i, x0, #16
+.endr
+ br x14
+1:
+ // Special case: For the last input column (x1 == 12),
+ // which would be stored as the last row in the temp buffer,
+ // don't store the first 4x4 block, but keep it in registers
+ // for the first slice of the second pass (where it is the
+ // last 4x4 block).
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+
+ mov v28.16b, v16.16b
+ mov v29.16b, v17.16b
+ mov v30.16b, v18.16b
+ mov v31.16b, v19.16b
+ br x14
+endfunc
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 4x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_4x16_pass2_neon
+ mov x14, x30
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+ load \i, x2, x9
+.endr
+ cbz x3, 1f
+.irp i, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl \txfm\()16
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+// This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
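+
+// In the 16x16 idct_idct path below, eob (w3) is compared against these
+// values: if it does not exceed the entry for a 4 column slice, that slice
+// and the following ones contain no nonzero coefficients, so their first
+// pass is skipped and the corresponding temp buffer rows are just cleared.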
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #1
+ b.eq idct16x16_dc_add_neon
+.endif
+ mov x15, x30
+ // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
+.ifnc \txfm1\()_\txfm2,idct_idct
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+.endif
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #1024
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ movrel x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+ movrel x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+ ld1 {v0.8h,v1.8h}, [x10]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+ mov x9, #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp w3, #10
+ b.le idct16x16_quarter_add_16_neon
+ cmp w3, #38
+ b.le idct16x16_half_add_16_neon
+
+ movrel x12, min_eob_idct_idct_16, 2
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, sp, #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(16 - \i)/4
+ b.le 1f
+.endif
+.endif
+ mov x1, #\i
+ add x2, x6, #(\i*4)
+ bl \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+ ld1 {v0.8h,v1.8h}, [x10]
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ // Set v28-v31 to zero, for the in-register passthrough of
+ // coefficients to pass 2.
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
+.endr
+ b.ne 2b
+3:
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ mov x3, #\i
+ bl \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+ add sp, sp, #1024
+ ldp d8, d9, [sp], 0x10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+.endif
+ br x15
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+ mov x13, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+ mov x13, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_4x16_pass1_quarter_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_quarter
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ // The first 4x4 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+ mov x14, x30
+
+ // Only load the top 4 lines, and only do it for the later slices.
+ // For the first slice, v16-v19 are kept in registers from the first pass.
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_quarter
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_half
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the transposed 4x4 blocks horizontally.
+ cmp x1, #4
+ b.eq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ store \i, x0, #16
+.endr
+ br x14
+1:
+ // Special case: For the second input column (x1 == 4),
+ // which would be stored as the second row in the temp buffer,
+ // don't store the first 4x4 block, but keep it in registers
+ // for the first slice of the second pass (where it is the
+ // second 4x4 block).
+ add x0, x0, #16
+ st1 {v20.4s}, [x0], #16
+ st1 {v24.4s}, [x0], #16
+ st1 {v28.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v21.4s}, [x0], #16
+ st1 {v25.4s}, [x0], #16
+ st1 {v29.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v22.4s}, [x0], #16
+ st1 {v26.4s}, [x0], #16
+ st1 {v30.4s}, [x0], #16
+ add x0, x0, #16
+ st1 {v23.4s}, [x0], #16
+ st1 {v27.4s}, [x0], #16
+ st1 {v31.4s}, [x0], #16
+
+ mov v20.16b, v16.16b
+ mov v21.16b, v17.16b
+ mov v22.16b, v18.16b
+ mov v23.16b, v19.16b
+ br x14
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+ mov x14, x30
+
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ cbz x3, 1f
+.irp i, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_half
+
+ dup v8.8h, w13
+ load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+ br x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+ add x0, sp, #(0*64)
+ mov x1, #0
+ add x2, x6, #(0*4)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+ add x0, sp, #(4*64)
+ mov x1, #4
+ add x2, x6, #(4*4)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.endif
+
+.irp i, 0, 4, 8, 12
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ mov x3, #\i
+ bl idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #1024
+ ldp d8, d9, [sp], 0x10
+ br x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel x4, idct_coeffs
+ ld1 {v0.4h}, [x4]
+ sxtl v0.4s, v0.4h
+
+ movi v1.4h, #0
+
+ ld1 {v2.s}[0], [x2]
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ smull v2.2d, v2.2s, v0.s[0]
+ rshrn v2.2s, v2.2d, #14
+ st1 {v1.s}[0], [x2]
+ dup v2.4s, v2.s[0]
+
+ srshr v0.4s, v2.4s, #6
+
+ mov x3, x0
+ mov x4, #32
+ sub x1, x1, #32
+ dup v31.8h, w13
+1:
+ // Loop to add the constant v0 into all 32x32 outputs
+ subs x4, x4, #1
+ ld1 {v1.8h,v2.8h}, [x0], #32
+ uaddw v16.4s, v0.4s, v1.4h
+ uaddw2 v17.4s, v0.4s, v1.8h
+ ld1 {v3.8h,v4.8h}, [x0], x1
+ uaddw v18.4s, v0.4s, v2.4h
+ uaddw2 v19.4s, v0.4s, v2.8h
+ uaddw v20.4s, v0.4s, v3.4h
+ uaddw2 v21.4s, v0.4s, v3.8h
+ uaddw v22.4s, v0.4s, v4.4h
+ uaddw2 v23.4s, v0.4s, v4.8h
+ sqxtun v1.4h, v16.4s
+ sqxtun2 v1.8h, v17.4s
+ sqxtun v2.4h, v18.4s
+ sqxtun2 v2.8h, v19.4s
+ sqxtun v3.4h, v20.4s
+ sqxtun2 v3.8h, v21.4s
+ sqxtun v4.4h, v22.4s
+ sqxtun2 v4.8h, v23.4s
+ umin v1.8h, v1.8h, v31.8h
+ umin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h,v2.8h}, [x3], #32
+ umin v3.8h, v3.8h, v31.8h
+ umin v4.8h, v4.8h, v31.8h
+ st1 {v3.8h,v4.8h}, [x3], x1
+ b.ne 1b
+
+ ret
+endfunc
+
+.macro idct32_end
+ butterfly_4s v16, v5, v4, v5 // v16 = t16a, v5 = t19a
+ butterfly_4s v17, v20, v23, v20 // v17 = t17, v20 = t18
+ butterfly_4s v18, v6, v7, v6 // v18 = t23a, v6 = t20a
+ butterfly_4s v19, v21, v22, v21 // v19 = t22, v21 = t21
+ butterfly_4s v4, v28, v28, v30 // v4 = t24a, v28 = t27a
+ butterfly_4s v23, v26, v25, v26 // v23 = t25, v26 = t26
+ butterfly_4s v7, v8, v29, v31 // v7 = t31a, v8 = t28a
+ butterfly_4s v22, v27, v24, v27 // v22 = t30, v27 = t29
+
+ dmbutterfly v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
+ dmbutterfly v8, v5, v0.s[2], v0.s[3], v24, v25, v30, v31 // v8 = t19, v5 = t28
+ dmbutterfly v28, v6, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
+ dmbutterfly v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+ butterfly_4s v31, v24, v7, v4 // v31 = t31, v24 = t24
+ butterfly_4s v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+ butterfly_4s_r v23, v16, v16, v18 // v23 = t23, v16 = t16
+ butterfly_4s_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+ butterfly_4s v18, v21, v27, v21 // v18 = t18, v21 = t21
+ butterfly_4s_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
+ butterfly_4s v29, v26, v20, v26 // v29 = t29, v26 = t26
+ butterfly_4s v19, v20, v8, v6 // v19 = t19a, v20 = t20
+
+ dmbutterfly0 v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27, v20 = t20
+ dmbutterfly0 v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
+ dmbutterfly0 v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25, v22 = t22
+ dmbutterfly0 v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
+ ret
+.endm
+
+function idct32_odd
+ dmbutterfly v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ dmbutterfly_h1 v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly_h2 v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly_h1 v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly_h2 v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly_h1 v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ butterfly_4s v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_4s v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_4s v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_4s v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_4s v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_4s v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_4s v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_4s v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ dsmull_h v4, v5, v16, v10.s[0]
+ dsmull_h v28, v29, v19, v11.s[3]
+ dsmull_h v30, v31, v16, v10.s[1]
+ dsmull_h v22, v23, v17, v13.s[2]
+ dsmull_h v7, v6, v17, v13.s[3]
+ dsmull_h v26, v27, v19, v11.s[2]
+ dsmull_h v20, v21, v18, v12.s[0]
+ dsmull_h v24, v25, v18, v12.s[1]
+
+ neg v28.2d, v28.2d
+ neg v29.2d, v29.2d
+ neg v7.2d, v7.2d
+ neg v6.2d, v6.2d
+
+ drshrn_h v4, v4, v5, #14
+ drshrn_h v5, v28, v29, #14
+ drshrn_h v29, v30, v31, #14
+ drshrn_h v28, v22, v23, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v31, v26, v27, #14
+ drshrn_h v6, v20, v21, #14
+ drshrn_h v30, v24, v25, #14
+
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1]
+ drshrn_h v23, v16, v17, #14
+ drshrn_h v24, v18, v19, #14
+ neg v20.2d, v20.2d
+ neg v21.2d, v21.2d
+ drshrn_h v27, v27, v26, #14
+ drshrn_h v20, v20, v21, #14
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3]
+ drshrn_h v21, v16, v17, #14
+ drshrn_h v26, v18, v19, #14
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3]
+ drshrn_h v25, v16, v17, #14
+ neg v18.2d, v18.2d
+ neg v19.2d, v19.2d
+ drshrn_h v22, v18, v19, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
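+//
+// In scalar terms, the recombination step is roughly (a sketch of the
+// decomposition only, not the exact fixed-point code):
+//   out[i]      = even[i] + odd[i]      for i = 0..15
+//   out[31 - i] = even[i] - odd[i]
+// where even[] is the idct16 of IN(0), IN(2), ..., IN(30) and odd[] is the
+// 16-point transform of IN(1), IN(3), ..., IN(31).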
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_4x32_pass1\suffix\()_neon
+ mov x14, x30
+
+ movi v4.4s, #0
+
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct16\suffix
+
+ // Do four 4x4 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+ // contain the four transposed 4x4 blocks.
+ transpose_4x4s v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4s v20, v21, v22, v23, v4, v5, v6, v7
+ transpose_4x4s v24, v25, v26, v27, v4, v5, v6, v7
+ transpose_4x4s v28, v29, v30, v31, v4, v5, v6, v7
+
+ // Store the registers a, b, c, d horizontally, followed by the
+ // same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+ // There's no rev128 instruction, but we reverse each 64-bit
+ // half, and then flip them using an ext with an 8-byte offset.
+ rev64 v7.4s, \d
+ st1 {\a}, [x0], #16
+ ext v7.16b, v7.16b, v7.16b, #8
+ st1 {\b}, [x0], #16
+ rev64 v6.4s, \c
+ st1 {\c}, [x0], #16
+ ext v6.16b, v6.16b, v6.16b, #8
+ st1 {\d}, [x0], #16
+ rev64 v5.4s, \b
+ st1 {v7.4s}, [x0], #16
+ ext v5.16b, v5.16b, v5.16b, #8
+ st1 {v6.4s}, [x0], #16
+ rev64 v4.4s, \a
+ st1 {v5.4s}, [x0], #16
+ ext v4.16b, v4.16b, v4.16b, #8
+ st1 {v4.4s}, [x0], #16
+.endm
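+ // Each store_rev call writes 8 consecutive 16-byte chunks (128 bytes):
+ // a, b, c, d followed by the reversed d, c, b, a, so the four calls
+ // below fill the 512 bytes that x0 is rewound by afterwards.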
+ store_rev v16.4s, v20.4s, v24.4s, v28.4s
+ store_rev v17.4s, v21.4s, v25.4s, v29.4s
+ store_rev v18.4s, v22.4s, v26.4s, v30.4s
+ store_rev v19.4s, v23.4s, v27.4s, v31.4s
+ sub x0, x0, #512
+.purgem store_rev
+
+ // Move x2 back to the start of the input, and move
+ // to the first odd row
+.ifb \suffix
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+ sub x2, x2, x9, lsl #3
+.endif
+ add x2, x2, #128
+
+ movi v4.4s, #0
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose_4x4s v31, v30, v29, v28, v4, v5, v6, v7
+ transpose_4x4s v27, v26, v25, v24, v4, v5, v6, v7
+ transpose_4x4s v23, v22, v21, v20, v4, v5, v6, v7
+ transpose_4x4s v19, v18, v17, v16, v4, v5, v6, v7
+
+ // Store the registers a, b, c, d horizontally,
+ // adding into the output first, and then the mirrored
+ // registers d, c, b, a, subtracted from the output.
+.macro store_rev a, b, c, d, a16b, b16b
+ ld1 {v4.4s}, [x0]
+ rev64 v9.4s, \d
+ add v4.4s, v4.4s, \a
+ st1 {v4.4s}, [x0], #16
+ rev64 v8.4s, \c
+ ld1 {v4.4s}, [x0]
+ ext v9.16b, v9.16b, v9.16b, #8
+ add v4.4s, v4.4s, \b
+ st1 {v4.4s}, [x0], #16
+ ext v8.16b, v8.16b, v8.16b, #8
+ ld1 {v4.4s}, [x0]
+ rev64 \b, \b
+ add v4.4s, v4.4s, \c
+ st1 {v4.4s}, [x0], #16
+ rev64 \a, \a
+ ld1 {v4.4s}, [x0]
+ ext \b16b, \b16b, \b16b, #8
+ add v4.4s, v4.4s, \d
+ st1 {v4.4s}, [x0], #16
+ ext \a16b, \a16b, \a16b, #8
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, v9.4s
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, v8.4s
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, \b
+ st1 {v4.4s}, [x0], #16
+ ld1 {v4.4s}, [x0]
+ sub v4.4s, v4.4s, \a
+ st1 {v4.4s}, [x0], #16
+.endm
+
+ store_rev v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+ store_rev v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+ store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+ store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
+.purgem store_rev
+ br x14
+endfunc
+
+// This is mostly the same as 4x32_pass1, but without the transpose;
+// it uses the source as a temp buffer between the two idct passes, and
+// adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_4x32_pass2\suffix\()_neon
+ mov x14, x30
+
+ // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+
+ bl idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ store \i, x2, x9
+.endr
+
+ sub x2, x2, x9, lsl #4
+ add x2, x2, #128
+
+ // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ sub x2, x2, x9, lsl #3
+.endif
+ sub x2, x2, #128
+
+ bl idct32_odd\suffix
+
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+ ld1 {v4.4s}, [x2], x9
+ ld1 {v5.4s}, [x2], x9
+ add v4.4s, v4.4s, \a
+ ld1 {v6.4s}, [x2], x9
+ add v5.4s, v5.4s, \b
+ ld1 {v7.4s}, [x2], x9
+ add v6.4s, v6.4s, \c
+ add v7.4s, v7.4s, \d
+.else
+ ld1 {v4.4s}, [x2], x7
+ ld1 {v5.4s}, [x2], x7
+ sub v4.4s, v4.4s, \a
+ ld1 {v6.4s}, [x2], x7
+ sub v5.4s, v5.4s, \b
+ ld1 {v7.4s}, [x2], x7
+ sub v6.4s, v6.4s, \c
+ sub v7.4s, v7.4s, \d
+.endif
+ ld1 {v8.4h}, [x0], x1
+ ld1 {v8.d}[1], [x0], x1
+ srshr v4.4s, v4.4s, #6
+ ld1 {v9.4h}, [x0], x1
+ srshr v5.4s, v5.4s, #6
+ uaddw v4.4s, v4.4s, v8.4h
+ ld1 {v9.d}[1], [x0], x1
+ srshr v6.4s, v6.4s, #6
+ uaddw2 v5.4s, v5.4s, v8.8h
+ srshr v7.4s, v7.4s, #6
+ sub x0, x0, x1, lsl #2
+ uaddw v6.4s, v6.4s, v9.4h
+ sqxtun v4.4h, v4.4s
+ uaddw2 v7.4s, v7.4s, v9.8h
+ sqxtun2 v4.8h, v5.4s
+ umin v4.8h, v4.8h, v15.8h
+ st1 {v4.4h}, [x0], x1
+ sqxtun v5.4h, v6.4s
+ st1 {v4.d}[1], [x0], x1
+ sqxtun2 v5.8h, v7.4s
+ umin v5.8h, v5.8h, v15.8h
+ st1 {v5.4h}, [x0], x1
+ st1 {v5.d}[1], [x0], x1
+.endm
+ load_acc_store v31.4s, v30.4s, v29.4s, v28.4s
+ load_acc_store v27.4s, v26.4s, v25.4s, v24.4s
+ load_acc_store v23.4s, v22.4s, v21.4s, v20.4s
+ load_acc_store v19.4s, v18.4s, v17.4s, v16.4s
+ sub x2, x2, x9
+ load_acc_store v16.4s, v17.4s, v18.4s, v19.4s, 1
+ load_acc_store v20.4s, v21.4s, v22.4s, v23.4s, 1
+ load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
+ load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
+.purgem load_acc_store
+ br x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
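+// The _quarter and _half variants only process the top-left 8x8 and 16x16
+// coefficients, respectively (see idct32_partial below and the eob thresholds
+// of 34 and 135 checked in vp9_idct_idct_32x32_add_16_neon).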
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
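+// When the block's eob is at or below the table entry for a given 4-column
+// slice, that slice and all following ones are skipped in pass 1 and their
+// part of the temp buffer is zero-filled instead.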
+
+function vp9_idct_idct_32x32_add_16_neon
+ cmp w3, #1
+ b.eq idct32x32_dc_add_neon
+
+ movrel x10, idct_coeffs
+
+ mov x15, x30
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+
+ sub sp, sp, #4096
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ // Double stride of the input, since we only read every other line
+ mov x9, #256
+ neg x7, x9
+
+ ld1 {v0.8h,v1.8h}, [x10], #32
+ sxtl v2.4s, v1.4h
+ sxtl2 v3.4s, v1.8h
+ sxtl2 v1.4s, v0.8h
+ sxtl v0.4s, v0.4h
+ ld1 {v10.8h,v11.8h}, [x10]
+ sxtl v12.4s, v11.4h
+ sxtl2 v13.4s, v11.8h
+ sxtl2 v11.4s, v10.8h
+ sxtl v10.4s, v10.4h
+
+ dup v15.8h, w13
+
+ cmp w3, #34
+ b.le idct32x32_quarter_add_16_neon
+ cmp w3, #135
+ b.le idct32x32_half_add_16_neon
+
+ movrel x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, sp, #(\i*128)
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(32 - \i)/4
+ b.le 1f
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.4s, #0
+ movi v17.4s, #0
+ movi v18.4s, #0
+ movi v19.4s, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+.endr
+ b.ne 2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ bl idct32_1d_4x32_pass2_neon
+.endr
+
+ add sp, sp, #4096
+ ldp d14, d15, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d8, d9, [sp], 0x10
+
+ br x15
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+ mov x13, #0x03ff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+ mov x13, #0x0fff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
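+// The 10 and 12 bpp entry points only differ in the clamp value placed in
+// w13 ((1 << bpp) - 1), which is dup'ed into v15 and applied with umin after
+// the final rounding in pass 2.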
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 4
+ add x0, sp, #(\i*128)
+.ifc \size,quarter
+.if \i == 4
+ cmp w3, #9
+ b.le 1f
+.endif
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+ add x0, sp, #(\i*128)
+.if \i == 12
+ cmp w3, #70
+ b.le 1f
+.endif
+ add x2, x6, #(\i*4)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.4s, #0
+ movi v17.4s, #0
+ movi v18.4s, #0
+ movi v19.4s, #0
+
+.rept 4
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+ st1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x0, x4, #(\i*2)
+ mov x1, x5
+ add x2, sp, #(\i*4)
+ bl idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, #4096
+ ldp d14, d15, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d8, d9, [sp], 0x10
+
+ br x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 3ffb418963..99413b0f70 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2016 Google Inc.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000000..9075f3d406
--- /dev/null
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8h, \r0\().8h, \r1\().8h
+ trn2 \t5\().8h, \r0\().8h, \r1\().8h
+ trn1 \t6\().8h, \r2\().8h, \r3\().8h
+ trn2 \t7\().8h, \r2\().8h, \r3\().8h
+
+ trn1 \r0\().4s, \t4\().4s, \t6\().4s
+ trn2 \r2\().4s, \t4\().4s, \t6\().4s
+ trn1 \r1\().4s, \t5\().4s, \t7\().4s
+ trn2 \r3\().4s, \t5\().4s, \t7\().4s
+.endm
+
+// The input to and output from this macro is in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+ dup v0.8h, w2 // E
+ dup v2.8h, w3 // I
+ dup v3.8h, w4 // H
+
+ uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
+ uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
+ uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
+ uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
+ uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
+ umax v4.8h, v4.8h, v5.8h
+ umax v5.8h, v6.8h, v7.8h
+ umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
+ uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
+ umax v4.8h, v4.8h, v5.8h
+ add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
+ uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
+ umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
+ ushr v5.8h, v5.8h, #1
+ cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
+ add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ cmhs v6.8h, v0.8h, v6.8h
+ and v4.16b, v4.16b, v6.16b // fm
+
+ // If no pixels need filtering, just exit as soon as possible
+ mov x11, v4.d[0]
+ mov x12, v4.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ br x10
+1:
+
+.if \wd >= 8
+ dup v0.8h, w5
+
+ uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
+ uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
+ uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
+ uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
+ uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
+ uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
+ umax v6.8h, v6.8h, v2.8h
+ umax v1.8h, v1.8h, \tmp1\().8h
+ umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
+.if \wd == 16
+ uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
+ umax v6.8h, v6.8h, v1.8h
+ uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
+ umax v6.8h, v6.8h, \tmp2\().8h
+ uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
+ cmhs v6.8h, v0.8h, v6.8h // flat8in
+ uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
+ and v6.16b, v6.16b, v4.16b // flat8in && fm
+ uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
+ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
+ uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
+ uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
+ uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)
+
+ umax v7.8h, v7.8h, v2.8h
+ umax v1.8h, v1.8h, v8.8h
+ umax v9.8h, v9.8h, v10.8h
+ umax v11.8h, v11.8h, v12.8h
+ // The rest of the calculation of flat8out is interleaved below
+.else
+ // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ // Calculate the normal inner loop filter for 2 or 4 pixels
+ uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
+.if \wd == 16
+ umax v7.8h, v7.8h, v1.8h
+ umax v9.8h, v9.8h, v11.8h
+.elseif \wd == 8
+ umax v6.8h, v6.8h, v1.8h
+.endif
+ uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
+.if \wd == 16
+ umax v7.8h, v7.8h, v9.8h
+.elseif \wd == 8
+ umax v6.8h, v6.8h, \tmp2\().8h
+.endif
+ dup \tmp2\().8h, w6 // left shift for saturation
+ sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
+ neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
+ umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
+ sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
+ movi \tmp5\().8h, #3
+.if \wd == 8
+ cmhs v6.8h, v0.8h, v6.8h // flat8in
+.endif
+ cmhs v5.8h, v3.8h, v5.8h // !hev
+.if \wd == 8
+ and v6.16b, v6.16b, v4.16b // flat8in && fm
+.endif
+ sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
+.if \wd == 16
+ cmhs v7.8h, v0.8h, v7.8h // flat8out
+.elseif \wd == 8
+ bic v4.16b, v4.16b, v6.16b // fm && !flat8in
+.endif
+ and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
+.if \wd == 16
+ and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
+.endif
+ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
+ bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
+ movi v2.8h, #4
+ add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ movi v3.8h, #3
+ sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
+ movi \tmp5\().8h, #0
+ sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ dup \tmp6\().8h, w7 // max pixel value
+.if \wd == 16
+ bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
+.endif
+
+ ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
+
+ add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
+ add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
+ smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
+ sshr \tmp4\().8h, \tmp4\().8h, #3 // f2
+
+ add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
+ sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
+ smin v0.8h, v0.8h, \tmp6\().8h
+ smin v2.8h, v2.8h, \tmp6\().8h
+ srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
+ smax v0.8h, v0.8h, \tmp5\().8h // out p0
+ smax v2.8h, v2.8h, \tmp5\().8h // out q0
+ bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
+ bit v24.16b, v2.16b, v4.16b
+
+ add v0.8h, v22.8h, \tmp3\().8h // p1 + f
+ sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
+.if \wd >= 8
+ mov x11, v6.d[0]
+.endif
+ smin v0.8h, v0.8h, \tmp6\().8h
+ smin v2.8h, v2.8h, \tmp6\().8h
+.if \wd >= 8
+ mov x12, v6.d[1]
+.endif
+ smax v0.8h, v0.8h, \tmp5\().8h // out p1
+ smax v2.8h, v2.8h, \tmp5\().8h // out q1
+.if \wd >= 8
+ adds x11, x11, x12
+.endif
+ bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
+ bit v25.16b, v2.16b, v5.16b
+
+ // If no pixels need flat8in, jump to flat8out
+ // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.if \wd == 16
+ b.eq 6f
+.else
+ b.ne 1f
+ br x13
+1:
+.endif
+
+ // flat8in
+ add \tmp1\().8h, v20.8h, v21.8h
+ add \tmp3\().8h, v22.8h, v25.8h
+ add \tmp5\().8h, v20.8h, v22.8h
+ add \tmp7\().8h, v23.8h, v26.8h
+ add v0.8h, \tmp1\().8h, \tmp1\().8h
+ add v0.8h, v0.8h, v23.8h
+ add v0.8h, v0.8h, v24.8h
+ add v0.8h, v0.8h, \tmp5\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+ urshr v2.8h, v0.8h, #3 // out p2
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ add \tmp1\().8h, v20.8h, v23.8h
+ add \tmp3\().8h, v24.8h, v27.8h
+ urshr v3.8h, v0.8h, #3 // out p1
+
+ add v0.8h, v0.8h, \tmp7\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ add \tmp5\().8h, v21.8h, v24.8h
+ add \tmp7\().8h, v25.8h, v27.8h
+ urshr v4.8h, v0.8h, #3 // out p0
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+ add \tmp1\().8h, v22.8h, v25.8h
+ add \tmp3\().8h, v26.8h, v27.8h
+ urshr v5.8h, v0.8h, #3 // out q0
+
+ add v0.8h, v0.8h, \tmp7\().8h
+ sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+ urshr \tmp5\().8h, v0.8h, #3 // out q1
+
+ add v0.8h, v0.8h, \tmp3\().8h
+ // The output here is written back into the input registers. This doesn't
+ // matter for the flat8out part below, since we only update those pixels
+ // which won't be touched below.
+ bit v21.16b, v2.16b, v6.16b
+ bit v22.16b, v3.16b, v6.16b
+ bit v23.16b, v4.16b, v6.16b
+ urshr \tmp6\().8h, v0.8h, #3 // out q2
+ bit v24.16b, v5.16b, v6.16b
+ bit v25.16b, \tmp5\().16b, v6.16b
+ bit v26.16b, \tmp6\().16b, v6.16b
+.endif
+.if \wd == 16
+6:
+ orr v2.16b, v6.16b, v7.16b
+ mov x11, v2.d[0]
+ mov x12, v2.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ // If no pixels needed flat8in nor flat8out, jump to a
+ // writeout of the inner 4 pixels
+ br x14
+1:
+
+ mov x11, v7.d[0]
+ mov x12, v7.d[1]
+ adds x11, x11, x12
+ b.ne 1f
+ // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ br x15
+
+1:
+ // flat8out
+ // This writes all outputs into v2-v17 (skipping v7 and v16).
+ // If this part is skipped, the output is read from v21-v26 (which is the input
+ // to this section).
+ shl v0.8h, v16.8h, #3 // 8 * v16
+ sub v0.8h, v0.8h, v16.8h // 7 * v16
+ add v0.8h, v0.8h, v17.8h
+ add v8.8h, v17.8h, v18.8h
+ add v10.8h, v19.8h, v20.8h
+ add v0.8h, v0.8h, v8.8h
+ add v8.8h, v16.8h, v17.8h
+ add v12.8h, v21.8h, v22.8h
+ add v0.8h, v0.8h, v10.8h
+ add v10.8h, v18.8h, v25.8h
+ add v14.8h, v23.8h, v24.8h
+ sub v10.8h, v10.8h, v8.8h
+ add v0.8h, v0.8h, v12.8h
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v18.8h
+ add v14.8h, v19.8h, v26.8h
+ urshr v2.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v8.8h, v16.8h, v19.8h
+ add v10.8h, v20.8h, v27.8h
+ sub v14.8h, v14.8h, v12.8h
+ bif v2.16b, v17.16b, v7.16b
+ urshr v3.8h , v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v20.8h
+ add v14.8h, v21.8h, v28.8h
+ sub v10.8h, v10.8h, v8.8h
+ bif v3.16b, v18.16b, v7.16b
+ urshr v4.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v8.8h, v16.8h, v21.8h
+ add v10.8h, v22.8h, v29.8h
+ sub v14.8h, v14.8h, v12.8h
+ bif v4.16b, v19.16b, v7.16b
+ urshr v5.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v12.8h, v16.8h, v22.8h
+ add v14.8h, v23.8h, v30.8h
+ sub v10.8h, v10.8h, v8.8h
+ bif v5.16b, v20.16b, v7.16b
+ urshr v6.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ add v10.8h, v16.8h, v23.8h
+ sub v14.8h, v14.8h, v12.8h
+ add v12.8h, v24.8h, v31.8h
+ bif v6.16b, v21.16b, v7.16b
+ urshr v8.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ sub v10.8h, v12.8h, v10.8h
+ add v12.8h, v17.8h, v24.8h
+ add v14.8h, v25.8h, v31.8h
+ bif v8.16b, v22.16b, v7.16b
+ urshr v9.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v10.8h
+ sub v14.8h, v14.8h, v12.8h
+ add v12.8h, v26.8h, v31.8h
+ bif v9.16b, v23.16b, v7.16b
+ urshr v10.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v14.8h, v18.8h, v25.8h
+ add v18.8h, v19.8h, v26.8h
+ sub v12.8h, v12.8h, v14.8h
+ add v14.8h, v27.8h, v31.8h
+ bif v10.16b, v24.16b, v7.16b
+ urshr v11.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v12.8h
+ add v12.8h, v20.8h, v27.8h
+ sub v14.8h, v14.8h, v18.8h
+ add v18.8h, v28.8h, v31.8h
+ bif v11.16b, v25.16b, v7.16b
+ sub v18.8h, v18.8h, v12.8h
+ urshr v12.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v14.8h
+ add v14.8h, v21.8h, v28.8h
+ add v20.8h, v29.8h, v31.8h
+ bif v12.16b, v26.16b, v7.16b
+ urshr v13.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v18.8h
+ sub v20.8h, v20.8h, v14.8h
+ add v18.8h, v22.8h, v29.8h
+ add v22.8h, v30.8h, v31.8h
+ bif v13.16b, v27.16b, v7.16b
+ urshr v14.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v20.8h
+ sub v22.8h, v22.8h, v18.8h
+ bif v14.16b, v28.16b, v7.16b
+ urshr v15.8h, v0.8h, #4
+
+ add v0.8h, v0.8h, v22.8h
+ bif v15.16b, v29.16b, v7.16b
+ urshr v17.8h, v0.8h, #4
+ bif v17.16b, v30.16b, v7.16b
+.endif
+.endm
+
+// For wd <= 8 we use v16-v19 and v28-v31 as temp registers, while for
+// wd=16 we need those for inputs/outputs and use v8-v15 as temp
+// registers there instead.
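+// vp9_loop_filter_8 and vp9_loop_filter_16 can also "return" early through
+// br x13/x14/x15 to the numbered labels set up by the loop_filter_8/16
+// macros below, skipping the wider writeback when flat8in/flat8out don't
+// apply to any pixel.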
+function vp9_loop_filter_4
+ loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+endfunc
+
+function vp9_loop_filter_8
+ loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
+ ret
+endfunc
+
+function vp9_loop_filter_16
+ loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
+ ret
+endfunc
+
+.macro loop_filter_4
+ bl vp9_loop_filter_4
+.endm
+
+.macro loop_filter_8
+ // calculate alternative 'return' targets
+ adr x13, 6f
+ bl vp9_loop_filter_8
+.endm
+
+.macro loop_filter_16
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
+ bl vp9_loop_filter_16
+.endm
+
+
+// The public functions in this file have the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
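+// For the loop_filter_*_16_* "mix2" variants, which filter two adjacent
+// 8-pixel edges at once, mb_lim, lim and hev_thr each pack two 8-bit values
+// in their two lowest bytes; bpp_frontend_mix2 below unpacks them before
+// scaling to the bit depth.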
+
+.macro bpp_frontend func, bpp, push
+function ff_\func\()_\bpp\()_neon, export=1
+.if \push
+ mov x16, x30
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+.endif
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+.if \push
+ bl \func\()_16_neon
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+ br x16
+.else
+ b \func\()_16_neon
+.endif
+endfunc
+.endm
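+// The bpp frontends scale the 8-bit E, I and H thresholds up by (bpp - 8)
+// and pass x5 = 1 << (bpp - 8) (flat8 comparison threshold), x6 = 16 - bpp
+// (shift used for the av_clip_int2p emulation) and x7 = (1 << bpp) - 1
+// (max pixel value) on to the bit-depth-agnostic core functions.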
+
+.macro bpp_frontends func, push=0
+ bpp_frontend \func, 10, \push
+ bpp_frontend \func, 12, \push
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+ mov x16, x30
+.if \push
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+.endif
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+ bl \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+ add x0, x0, x1, lsl #3
+.else
+ add x0, x0, #16
+.endif
+ bl \func\()_\int_suffix\()_16_neon
+.if \push
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+.endif
+ br x16
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
+ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
+ bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+ mov x16, x30
+ lsr w8, w2, #8
+ lsr w14, w3, #8
+ lsr w15, w4, #8
+ and w2, w2, #0xff
+ and w3, w3, #0xff
+ and w4, w4, #0xff
+ lsl w2, w2, #\bpp - 8
+ lsl w3, w3, #\bpp - 8
+ lsl w4, w4, #\bpp - 8
+ mov x5, #1 << (\bpp - 8)
+ mov x6, #16 - \bpp
+ mov x7, #((1 << \bpp) - 1)
+ bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+ add x0, x0, x1, lsl #3
+.else
+ add x0, x0, #16
+.endif
+ lsl w2, w8, #\bpp - 8
+ lsl w3, w14, #\bpp - 8
+ lsl w4, w15, #\bpp - 8
+ bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+ br x16
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+ bpp_frontend_mix2 \wd1, \wd2, v, 10
+ bpp_frontend_mix2 \wd1, \wd2, v, 12
+ bpp_frontend_mix2 \wd1, \wd2, h, 10
+ bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #2
+ sub x9, x9, x1, lsl #1
+
+ loop_filter_4
+
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+function vp9_loop_filter_h_4_8_16_neon
+ mov x10, x30
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_4
+
+ // Move x9 forward by 2 pixels; we don't need to rewrite the
+ // outermost 2 pixels since they aren't changed.
+ add x9, x9, #4
+ add x0, x9, x1, lsl #2
+
+ // We only will write the mid 4 pixels back; after the loop filter,
+ // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+ // We need to transpose them to columns, done with a 4x8 transpose
+ // (which in practice is two 4x4 transposes of the two 4x4 halves
+ // of the 8x4 pixels; into 4x8 pixels).
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+function vp9_loop_filter_v_8_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v27.8h}, [x0], x1 // q3
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #2
+ add x9, x9, x1
+
+ loop_filter_8
+
+ st1 {v21.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+
+ br x10
+6:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+function vp9_loop_filter_h_8_8_16_neon
+ mov x10, x30
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+
+ sub x9, x9, x1, lsl #2
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ loop_filter_8
+
+ add x0, x9, x1, lsl #2
+
+ // Even though only 6 pixels per row have been changed, we write the
+ // full 8 pixel registers.
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v27.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+
+ br x10
+6:
+ // If we didn't need to do the flat8in part, we use the same writeback
+ // as in loop_filter_h_4_8.
+ add x9, x9, #4
+ add x0, x9, x1, lsl #2
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_8_16_neon
+ mov x10, x30
+ sub x9, x0, x1, lsl #3
+ ld1 {v16.8h}, [x9], x1 // p7
+ ld1 {v24.8h}, [x0], x1 // q0
+ ld1 {v17.8h}, [x9], x1 // p6
+ ld1 {v25.8h}, [x0], x1 // q1
+ ld1 {v18.8h}, [x9], x1 // p5
+ ld1 {v26.8h}, [x0], x1 // q2
+ ld1 {v19.8h}, [x9], x1 // p4
+ ld1 {v27.8h}, [x0], x1 // q3
+ ld1 {v20.8h}, [x9], x1 // p3
+ ld1 {v28.8h}, [x0], x1 // q4
+ ld1 {v21.8h}, [x9], x1 // p2
+ ld1 {v29.8h}, [x0], x1 // q5
+ ld1 {v22.8h}, [x9], x1 // p1
+ ld1 {v30.8h}, [x0], x1 // q6
+ ld1 {v23.8h}, [x9], x1 // p0
+ ld1 {v31.8h}, [x0], x1 // q7
+ sub x9, x9, x1, lsl #3
+ sub x0, x0, x1, lsl #3
+ add x9, x9, x1
+
+ loop_filter_16
+
+ // If we did the flat8out part, we get the output in
+ // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+ // store v2-v9 there, and v10-v17 into x0.
+ st1 {v2.8h}, [x9], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v3.8h}, [x9], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v4.8h}, [x9], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v5.8h}, [x9], x1
+ st1 {v13.8h}, [x0], x1
+ st1 {v6.8h}, [x9], x1
+ st1 {v14.8h}, [x0], x1
+ st1 {v8.8h}, [x9], x1
+ st1 {v15.8h}, [x0], x1
+ st1 {v9.8h}, [x9], x1
+ st1 {v17.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, x1
+
+ br x10
+8:
+ add x9, x9, x1, lsl #2
+ // If we didn't do the flat8out part, the output is left in the
+ // input registers.
+ st1 {v21.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ sub x0, x0, x1
+ br x10
+7:
+ sub x9, x0, x1, lsl #1
+ st1 {v22.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
+
+function vp9_loop_filter_h_16_8_16_neon
+ mov x10, x30
+ sub x9, x0, #16
+ ld1 {v16.8h}, [x9], x1
+ ld1 {v24.8h}, [x0], x1
+ ld1 {v17.8h}, [x9], x1
+ ld1 {v25.8h}, [x0], x1
+ ld1 {v18.8h}, [x9], x1
+ ld1 {v26.8h}, [x0], x1
+ ld1 {v19.8h}, [x9], x1
+ ld1 {v27.8h}, [x0], x1
+ ld1 {v20.8h}, [x9], x1
+ ld1 {v28.8h}, [x0], x1
+ ld1 {v21.8h}, [x9], x1
+ ld1 {v29.8h}, [x0], x1
+ ld1 {v22.8h}, [x9], x1
+ ld1 {v30.8h}, [x0], x1
+ ld1 {v23.8h}, [x9], x1
+ ld1 {v31.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ sub x9, x9, x1, lsl #3
+
+ // The 16x8 pixels read above are in two 8x8 blocks; the left
+ // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
+ // of this, to get one column per register.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
+
+ loop_filter_16
+
+ transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
+ transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
+
+ st1 {v16.8h}, [x9], x1
+ st1 {v10.8h}, [x0], x1
+ st1 {v2.8h}, [x9], x1
+ st1 {v11.8h}, [x0], x1
+ st1 {v3.8h}, [x9], x1
+ st1 {v12.8h}, [x0], x1
+ st1 {v4.8h}, [x9], x1
+ st1 {v13.8h}, [x0], x1
+ st1 {v5.8h}, [x9], x1
+ st1 {v14.8h}, [x0], x1
+ st1 {v6.8h}, [x9], x1
+ st1 {v15.8h}, [x0], x1
+ st1 {v8.8h}, [x9], x1
+ st1 {v17.8h}, [x0], x1
+ st1 {v9.8h}, [x9], x1
+ st1 {v31.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+
+ br x10
+8:
+ // The same writeback as in loop_filter_h_8_8
+ sub x9, x0, #8
+ add x0, x9, x1, lsl #2
+ transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+ st1 {v20.8h}, [x9], x1
+ st1 {v24.8h}, [x0], x1
+ st1 {v21.8h}, [x9], x1
+ st1 {v25.8h}, [x0], x1
+ st1 {v22.8h}, [x9], x1
+ st1 {v26.8h}, [x0], x1
+ st1 {v23.8h}, [x9], x1
+ st1 {v27.8h}, [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ br x10
+7:
+ // The same writeback as in loop_filter_h_4_8
+ sub x9, x0, #4
+ add x0, x9, x1, lsl #2
+ transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
+ st1 {v22.d}[0], [x9], x1
+ st1 {v22.d}[1], [x0], x1
+ st1 {v23.d}[0], [x9], x1
+ st1 {v23.d}[1], [x0], x1
+ st1 {v24.d}[0], [x9], x1
+ st1 {v24.d}[1], [x0], x1
+ st1 {v25.d}[0], [x9], x1
+ st1 {v25.d}[1], [x0], x1
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #4
+ br x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e9c497096b..0878763020 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2016 Google Inc.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..cac6428709
--- /dev/null
+++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+// const uint8_t *ref, ptrdiff_t ref_stride,
+// int h, int mx, int my);
+
+function ff_vp9_copy128_aarch64, export=1
+1:
+ ldp x5, x6, [x2]
+ ldp x7, x8, [x2, #16]
+ stp x5, x6, [x0]
+ ldp x9, x10, [x2, #32]
+ stp x7, x8, [x0, #16]
+ subs w4, w4, #1
+ ldp x11, x12, [x2, #48]
+ stp x9, x10, [x0, #32]
+ stp x11, x12, [x0, #48]
+ ldp x5, x6, [x2, #64]
+ ldp x7, x8, [x2, #80]
+ stp x5, x6, [x0, #64]
+ ldp x9, x10, [x2, #96]
+ stp x7, x8, [x0, #80]
+ ldp x11, x12, [x2, #112]
+ stp x9, x10, [x0, #96]
+ stp x11, x12, [x0, #112]
+ add x2, x2, x3
+ add x0, x0, x1
+ b.ne 1b
+ ret
+endfunc
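+// copy128 copies 128 bytes (64 pixels at 16 bpp) per row using only
+// general-purpose register pairs, which is presumably why it carries the
+// _aarch64 rather than _neon suffix.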
+
+function ff_vp9_avg64_16_neon, export=1
+ mov x5, x0
+ sub x1, x1, #64
+ sub x3, x3, #64
+1:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v4.8h
+ urhadd v1.8h, v1.8h, v5.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+ urhadd v2.8h, v2.8h, v6.8h
+ urhadd v3.8h, v3.8h, v7.8h
+ subs w4, w4, #1
+ urhadd v16.8h, v16.8h, v20.8h
+ urhadd v17.8h, v17.8h, v21.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
+ urhadd v18.8h, v18.8h, v22.8h
+ urhadd v19.8h, v19.8h, v23.8h
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+ ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v4.8h
+ urhadd v1.8h, v1.8h, v5.8h
+ ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
+ urhadd v2.8h, v2.8h, v6.8h
+ urhadd v3.8h, v3.8h, v7.8h
+ subs w4, w4, #2
+ urhadd v16.8h, v16.8h, v20.8h
+ urhadd v17.8h, v17.8h, v21.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
+ urhadd v18.8h, v18.8h, v22.8h
+ urhadd v19.8h, v19.8h, v23.8h
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+1:
+ ld1 {v2.8h, v3.8h}, [x2], x3
+ ld1 {v0.8h, v1.8h}, [x0]
+ urhadd v0.8h, v0.8h, v2.8h
+ urhadd v1.8h, v1.8h, v3.8h
+ subs w4, w4, #1
+ st1 {v0.8h, v1.8h}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.8h}, [x2], x3
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v3.8h}, [x2], x3
+ urhadd v0.8h, v0.8h, v2.8h
+ ld1 {v1.8h}, [x0], x1
+ urhadd v1.8h, v1.8h, v3.8h
+ subs w4, w4, #2
+ st1 {v0.8h}, [x5], x1
+ st1 {v1.8h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+ mov x5, x0
+1:
+ ld1 {v2.4h}, [x2], x3
+ ld1 {v0.4h}, [x0], x1
+ ld1 {v3.4h}, [x2], x3
+ urhadd v0.4h, v0.4h, v2.4h
+ ld1 {v1.4h}, [x0], x1
+ urhadd v1.4h, v1.4h, v3.4h
+ subs w4, w4, #2
+ st1 {v0.4h}, [x5], x1
+ st1 {v1.4h}, [x5], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
+// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
+// for size >= 16)
+.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
+ ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
+ smlal \dst1\().4s, v20.4h, v0.h[\offset]
+ smlal \dst5\().4s, v22.4h, v0.h[\offset]
+.if \size >= 16
+ ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+ ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+.endif
+.if \size >= 8
+ smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
+ smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
+.endif
+.if \size >= 16
+ smlal \dst3\().4s, v21.4h, v0.h[\offset]
+ smlal \dst7\().4s, v23.4h, v0.h[\offset]
+ smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
+ smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
+.endif
+.endm
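+// Together with the initial smull/smull2 on tap 0 below, the seven extmlal
+// invocations (offsets 1-7) accumulate the full 8-tap filter, i.e. roughly
+// dst[x] = sum(src[x + k] * filter[k], k = 0..7) relative to the already
+// adjusted source pointer, keeping 32-bit precision until the final
+// rounding shift.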
+
+
+// Instantiate a horizontal filter function for the given size.
+// This can work on 4, 8 or 16 pixels in parallel; for larger
+// widths it will do 16 pixels at a time and loop horizontally.
+// The actual width (in bytes) is passed in x5, the height in w4 and
+// the filter coefficients in x9.
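+// Two output rows are processed per iteration: x0/x2 address the first row
+// and x6/x7 the second, with the strides doubled accordingly. x2 is moved
+// back 6 bytes (3 pixels) since the 8-tap window spans src[-3] .. src[+4].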
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+ sub x2, x2, #6
+ add x6, x0, x1
+ add x7, x2, x3
+ add x1, x1, x1
+ add x3, x3, x3
+ // Only size >= 16 loops horizontally and needs
+ // a reduced dst stride
+.if \size >= 16
+ sub x1, x1, x5
+.endif
+ // size >= 16 loads three qwords and increments x2,
+ // for size 4/8 it's enough with two qwords and no
+ // postincrement
+.if \size >= 16
+ sub x3, x3, x5
+ sub x3, x3, #16
+.endif
+ // Load the filter vector
+ ld1 {v0.8h}, [x9]
+1:
+.if \size >= 16
+ mov x9, x5
+.endif
+ // Load src
+.if \size >= 16
+ ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
+ ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
+.else
+ ld1 {v5.8h, v6.8h}, [x2]
+ ld1 {v16.8h, v17.8h}, [x7]
+.endif
+2:
+
+ smull v1.4s, v5.4h, v0.h[0]
+ smull v24.4s, v16.4h, v0.h[0]
+.if \size >= 8
+ smull2 v2.4s, v5.8h, v0.h[0]
+ smull2 v25.4s, v16.8h, v0.h[0]
+.endif
+.if \size >= 16
+ smull v3.4s, v6.4h, v0.h[0]
+ smull v26.4s, v17.4h, v0.h[0]
+ smull2 v4.4s, v6.8h, v0.h[0]
+ smull2 v27.4s, v17.8h, v0.h[0]
+.endif
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
+ extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
+
+ // Round, shift and saturate
+ // The sqrshrun takes care of clamping negative values to zero, but
+ // we still need to do a umin against the max pixel value manually.
+ sqrshrun v1.4h, v1.4s, #7
+ sqrshrun v24.4h, v24.4s, #7
+.if \size >= 8
+ sqrshrun2 v1.8h, v2.4s, #7
+ sqrshrun2 v24.8h, v25.4s, #7
+ umin v1.8h, v1.8h, v31.8h
+ umin v24.8h, v24.8h, v31.8h
+.if \size >= 16
+ sqrshrun v2.4h, v3.4s, #7
+ sqrshrun v25.4h, v26.4s, #7
+ sqrshrun2 v2.8h, v4.4s, #7
+ sqrshrun2 v25.8h, v27.4s, #7
+ umin v2.8h, v2.8h, v31.8h
+ umin v25.8h, v25.8h, v31.8h
+.endif
+.else
+ umin v1.4h, v1.4h, v31.4h
+ umin v24.4h, v24.4h, v31.4h
+.endif
+ // Average
+.ifc \type,avg
+.if \size >= 16
+ ld1 {v3.8h, v4.8h}, [x0]
+ ld1 {v29.8h, v30.8h}, [x6]
+ urhadd v1.8h, v1.8h, v3.8h
+ urhadd v2.8h, v2.8h, v4.8h
+ urhadd v24.8h, v24.8h, v29.8h
+ urhadd v25.8h, v25.8h, v30.8h
+.elseif \size >= 8
+ ld1 {v3.8h}, [x0]
+ ld1 {v4.8h}, [x6]
+ urhadd v1.8h, v1.8h, v3.8h
+ urhadd v24.8h, v24.8h, v4.8h
+.else
+ ld1 {v3.4h}, [x0]
+ ld1 {v4.4h}, [x6]
+ urhadd v1.4h, v1.4h, v3.4h
+ urhadd v24.4h, v24.4h, v4.4h
+.endif
+.endif
+ // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+ subs x9, x9, #32
+ st1 {v1.8h, v2.8h}, [x0], #32
+ st1 {v24.8h, v25.8h}, [x6], #32
+ b.eq 3f
+ mov v5.16b, v7.16b
+ mov v16.16b, v18.16b
+ ld1 {v6.8h, v7.8h}, [x2], #32
+ ld1 {v17.8h, v18.8h}, [x7], #32
+ b 2b
+.elseif \size == 8
+ st1 {v1.8h}, [x0]
+ st1 {v24.8h}, [x6]
+.else // \size == 4
+ st1 {v1.4h}, [x0]
+ st1 {v24.4h}, [x6]
+.endif
+3:
+ // Loop vertically
+ add x0, x0, x1
+ add x6, x6, x1
+ add x2, x2, x3
+ add x7, x7, x3
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+ mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ movrel x6, X(ff_vp9_subpel_filters), 256*\offset
+ cmp w5, #8
+ add x9, x6, w5, uxtw #4
+ mov x5, #2*\size
+.if \size >= 16
+ b \type\()_8tap_16h
+.else
+ b \type\()_8tap_\size\()h
+.endif
+endfunc
+.endm
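+// The mvni with lsl #8 above materializes the (1 << bpp) - 1 clamp value in
+// every lane of v31: 0x03ff for 10 bpp and 0x0fff for 12 bpp.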
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp, 2, \size, \bpp
+do_8tap_h_func avg, sharp, 2, \size, \bpp
+do_8tap_h_func put, smooth, 0, \size, \bpp
+do_8tap_h_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8, \bpp
+do_8tap_h_filters 4, \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+
+// Vertical filters
+
+// Round, shift, saturate and store reg1-reg4
+.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+ sqrshrun \reg1\().4h, \reg1\().4s, #7
+ sqrshrun \reg2\().4h, \reg2\().4s, #7
+ sqrshrun \reg3\().4h, \reg3\().4s, #7
+ sqrshrun \reg4\().4h, \reg4\().4s, #7
+.ifc \type,avg
+ ld1 {\tmp1\().4h}, [x7], x1
+ ld1 {\tmp2\().4h}, [x7], x1
+ ld1 {\tmp3\().4h}, [x7], x1
+ ld1 {\tmp4\().4h}, [x7], x1
+.endif
+ umin \reg1\().4h, \reg1\().4h, \minreg\().4h
+ umin \reg2\().4h, \reg2\().4h, \minreg\().4h
+ umin \reg3\().4h, \reg3\().4h, \minreg\().4h
+ umin \reg4\().4h, \reg4\().4h, \minreg\().4h
+.ifc \type,avg
+ urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
+ urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
+ urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
+ urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
+.endif
+ st1 {\reg1\().4h}, [x0], x1
+ st1 {\reg2\().4h}, [x0], x1
+ st1 {\reg3\().4h}, [x0], x1
+ st1 {\reg4\().4h}, [x0], x1
+.endm
+
+// Round, shift, saturate and store reg1-8, where
+// reg1-2, reg3-4 etc. pairwise correspond to 4 rows.
+.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
+ sqrshrun \reg1\().4h, \reg1\().4s, #7
+ sqrshrun2 \reg1\().8h, \reg2\().4s, #7
+ sqrshrun \reg2\().4h, \reg3\().4s, #7
+ sqrshrun2 \reg2\().8h, \reg4\().4s, #7
+ sqrshrun \reg3\().4h, \reg5\().4s, #7
+ sqrshrun2 \reg3\().8h, \reg6\().4s, #7
+ sqrshrun \reg4\().4h, \reg7\().4s, #7
+ sqrshrun2 \reg4\().8h, \reg8\().4s, #7
+.ifc \type,avg
+ ld1 {\reg5\().8h}, [x7], x1
+ ld1 {\reg6\().8h}, [x7], x1
+ ld1 {\reg7\().8h}, [x7], x1
+ ld1 {\reg8\().8h}, [x7], x1
+.endif
+ umin \reg1\().8h, \reg1\().8h, \minreg\().8h
+ umin \reg2\().8h, \reg2\().8h, \minreg\().8h
+ umin \reg3\().8h, \reg3\().8h, \minreg\().8h
+ umin \reg4\().8h, \reg4\().8h, \minreg\().8h
+.ifc \type,avg
+ urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
+ urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
+ urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
+ urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
+.endif
+ st1 {\reg1\().8h}, [x0], x1
+ st1 {\reg2\().8h}, [x0], x1
+ st1 {\reg3\().8h}, [x0], x1
+ st1 {\reg4\().8h}, [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+ smull \dst1\().4s, \src1\().4h, v0.h[0]
+ smull \dst2\().4s, \src2\().4h, v0.h[0]
+ smull \tmp1\().4s, \src2\().4h, v0.h[1]
+ smull \tmp2\().4s, \src3\().4h, v0.h[1]
+ smlal \dst1\().4s, \src3\().4h, v0.h[2]
+ smlal \dst2\().4s, \src4\().4h, v0.h[2]
+ smlal \tmp1\().4s, \src4\().4h, v0.h[3]
+ smlal \tmp2\().4s, \src5\().4h, v0.h[3]
+ smlal \dst1\().4s, \src5\().4h, v0.h[4]
+ smlal \dst2\().4s, \src6\().4h, v0.h[4]
+ smlal \tmp1\().4s, \src6\().4h, v0.h[5]
+ smlal \tmp2\().4s, \src7\().4h, v0.h[5]
+ smlal \dst1\().4s, \src7\().4h, v0.h[6]
+ smlal \dst2\().4s, \src8\().4h, v0.h[6]
+ smlal \tmp1\().4s, \src8\().4h, v0.h[7]
+ smlal \tmp2\().4s, \src9\().4h, v0.h[7]
+ add \dst1\().4s, \dst1\().4s, \tmp1\().4s
+ add \dst2\().4s, \dst2\().4s, \tmp2\().4s
+.endm
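+// convolve4 splits the 8 taps over two accumulators per output (even taps
+// into dst, odd taps into tmp) and adds them at the end, presumably to
+// shorten the multiply-accumulate dependency chain.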
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
+// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
+ smull \dst1\().4s, \src1\().4h, v0.h[0]
+ smull2 \dst2\().4s, \src1\().8h, v0.h[0]
+ smull \dst3\().4s, \src2\().4h, v0.h[0]
+ smull2 \dst4\().4s, \src2\().8h, v0.h[0]
+ smlal \dst1\().4s, \src2\().4h, v0.h[1]
+ smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
+ smlal \dst3\().4s, \src3\().4h, v0.h[1]
+ smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
+ smlal \dst1\().4s, \src3\().4h, v0.h[2]
+ smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
+ smlal \dst3\().4s, \src4\().4h, v0.h[2]
+ smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
+ smlal \dst1\().4s, \src4\().4h, v0.h[3]
+ smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
+ smlal \dst3\().4s, \src5\().4h, v0.h[3]
+ smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
+ smlal \dst1\().4s, \src5\().4h, v0.h[4]
+ smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
+ smlal \dst3\().4s, \src6\().4h, v0.h[4]
+ smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
+ smlal \dst1\().4s, \src6\().4h, v0.h[5]
+ smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
+ smlal \dst3\().4s, \src7\().4h, v0.h[5]
+ smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
+ smlal \dst1\().4s, \src7\().4h, v0.h[6]
+ smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
+ smlal \dst3\().4s, \src8\().4h, v0.h[6]
+ smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
+ smlal \dst1\().4s, \src8\().4h, v0.h[7]
+ smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
+ smlal \dst3\().4s, \src9\().4h, v0.h[7]
+ smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5 and the filter coefficients
+// in x6.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+1:
+.ifc \type,avg
+ mov x7, x0
+.endif
+ mov x6, x4
+
+ ld1 {v17.8h}, [x2], x3
+ ld1 {v18.8h}, [x2], x3
+ ld1 {v19.8h}, [x2], x3
+ ld1 {v20.8h}, [x2], x3
+ ld1 {v21.8h}, [x2], x3
+ ld1 {v22.8h}, [x2], x3
+ ld1 {v23.8h}, [x2], x3
+2:
+ ld1 {v24.8h}, [x2], x3
+ ld1 {v25.8h}, [x2], x3
+ ld1 {v26.8h}, [x2], x3
+ ld1 {v27.8h}, [x2], x3
+
+ convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
+ convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
+ do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ ld1 {v16.8h}, [x2], x3
+ ld1 {v17.8h}, [x2], x3
+ ld1 {v18.8h}, [x2], x3
+ ld1 {v19.8h}, [x2], x3
+ convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
+ convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
+ do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
+
+ subs x6, x6, #4
+ b.eq 8f
+
+ ld1 {v20.8h}, [x2], x3
+ ld1 {v21.8h}, [x2], x3
+ ld1 {v22.8h}, [x2], x3
+ ld1 {v23.8h}, [x2], x3
+ convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
+ convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
+ do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
+
+ subs x6, x6, #4
+ b.ne 2b
+
+8:
+ subs x5, x5, #8
+ b.eq 9f
+ // x0 -= h * dst_stride
+ msub x0, x1, x4, x0
+ // x2 -= h * src_stride
+ msub x2, x3, x4, x2
+ // x2 -= 8 * src_stride
+ sub x2, x2, x3, lsl #3
+ // x2 += 1 * src_stride
+ add x2, x2, x3
+ add x2, x2, #16
+ add x0, x0, #16
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+
+// Instantiate a vertical filter function for filtering a 4 pixel wide
+// slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ ld1 {v0.8h}, [x6]
+.ifc \type,avg
+ mov x7, x0
+.endif
+
+ ld1 {v16.4h}, [x2], x3
+ ld1 {v17.4h}, [x2], x3
+ ld1 {v18.4h}, [x2], x3
+ ld1 {v19.4h}, [x2], x3
+ ld1 {v20.4h}, [x2], x3
+ ld1 {v21.4h}, [x2], x3
+ ld1 {v22.4h}, [x2], x3
+ ld1 {v23.4h}, [x2], x3
+ ld1 {v24.4h}, [x2], x3
+ ld1 {v25.4h}, [x2], x3
+ ld1 {v26.4h}, [x2], x3
+
+ convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
+ convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
+ do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
+
+ subs x4, x4, #4
+ b.eq 9f
+
+ ld1 {v27.4h}, [x2], x3
+ ld1 {v28.4h}, [x2], x3
+ ld1 {v29.4h}, [x2], x3
+ ld1 {v30.4h}, [x2], x3
+
+ convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
+ convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
+ do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
+
+9:
+ ret
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+ uxtw x4, w4
+ mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
+ add x6, x5, w6, uxtw #4
+ mov x5, #\size
+.if \size >= 8
+ b \type\()_8tap_8v
+.else
+ b \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp, 2, \size, \bpp
+do_8tap_v_func avg, sharp, 2, \size, \bpp
+do_8tap_v_func put, smooth, 0, \size, \bpp
+do_8tap_v_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8, \bpp
+do_8tap_v_filters 4, \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 584c114269..f67624ca04 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2016 Google Inc.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -269,8 +269,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2
sub x3, x3, #8
.endif
// Load the filter vector
- ld1 {v0.8b}, [x9]
- sxtl v0.8h, v0.8b
+ ld1 {v0.8h}, [x9]
1:
.if \size >= 16
mov x9, x5
@@ -384,9 +383,9 @@ do_8tap_h_size 16
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
- movrel x6, X(ff_vp9_subpel_filters), 120*\offset - 8
+ movrel x6, X(ff_vp9_subpel_filters), 256*\offset
cmp w5, #8
- add x9, x6, w5, uxtw #3
+ add x9, x6, w5, uxtw #4
mov x5, #\size
.if \size >= 16
b.ge \type\()_8tap_16h_34
@@ -516,8 +515,7 @@ do_8tap_h_filters 4
function \type\()_8tap_8v_\idx1\idx2
sub x2, x2, x3, lsl #1
sub x2, x2, x3
- ld1 {v0.8b}, [x6]
- sxtl v0.8h, v0.8b
+ ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
mov x7, x0
@@ -590,8 +588,7 @@ do_8tap_8v avg, 4, 3
function \type\()_8tap_4v_\idx1\idx2
sub x2, x2, x3, lsl #1
sub x2, x2, x3
- ld1 {v0.8b}, [x6]
- sxtl v0.8h, v0.8b
+ ld1 {v0.8h}, [x6]
.ifc \type,avg
mov x7, x0
.endif
@@ -660,9 +657,9 @@ do_8tap_4v avg, 4, 3
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
uxtw x4, w4
- movrel x5, X(ff_vp9_subpel_filters), 120*\offset - 8
+ movrel x5, X(ff_vp9_subpel_filters), 256*\offset
cmp w6, #8
- add x6, x5, w6, uxtw #3
+ add x6, x5, w6, uxtw #4
mov x5, #\size
.if \size >= 8
b.ge \type\()_8tap_8v_34