Diffstat (limited to 'libavcodec/arm')
-rw-r--r--  libavcodec/arm/Makefile | 28
-rw-r--r--  libavcodec/arm/aac.h | 8
-rw-r--r--  libavcodec/arm/aacpsdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/aacpsdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_arm.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_init_arm.c | 18
-rw-r--r--  libavcodec/arm/ac3dsp_neon.S | 52
-rw-r--r--  libavcodec/arm/asm-offsets.h | 8
-rw-r--r--  libavcodec/arm/audiodsp_arm.h | 8
-rw-r--r--  libavcodec/arm/audiodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/audiodsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/audiodsp_neon.S | 8
-rw-r--r--  libavcodec/arm/blockdsp_arm.h | 10
-rw-r--r--  libavcodec/arm/blockdsp_init_arm.c | 12
-rw-r--r--  libavcodec/arm/blockdsp_init_neon.c | 16
-rw-r--r--  libavcodec/arm/blockdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/cabac.h | 14
-rw-r--r--  libavcodec/arm/dca.h | 85
-rw-r--r--  libavcodec/arm/dcadsp_neon.S | 64
-rw-r--r--  libavcodec/arm/dcadsp_vfp.S | 476
-rw-r--r--  libavcodec/arm/fft_fixed_init_arm.c | 20
-rw-r--r--  libavcodec/arm/fft_fixed_neon.S | 8
-rw-r--r--  libavcodec/arm/fft_init_arm.c | 25
-rw-r--r--  libavcodec/arm/fft_neon.S | 8
-rw-r--r--  libavcodec/arm/fft_vfp.S | 8
-rw-r--r--  libavcodec/arm/flacdsp_arm.S | 8
-rw-r--r--  libavcodec/arm/flacdsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/fmtconvert_init_arm.c | 8
-rw-r--r--  libavcodec/arm/fmtconvert_neon.S | 8
-rw-r--r--  libavcodec/arm/fmtconvert_vfp.S | 8
-rw-r--r--  libavcodec/arm/g722dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/g722dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/h264chroma_init_arm.c | 8
-rw-r--r--  libavcodec/arm/h264cmc_neon.S | 10
-rw-r--r--  libavcodec/arm/h264dsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/h264dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/h264idct_neon.S | 8
-rw-r--r--  libavcodec/arm/h264pred_init_arm.c | 10
-rw-r--r--  libavcodec/arm/h264pred_neon.S | 8
-rw-r--r--  libavcodec/arm/h264qpel_init_arm.c | 8
-rw-r--r--  libavcodec/arm/h264qpel_neon.S | 8
-rw-r--r--  libavcodec/arm/hevcdsp_arm.h | 26
-rw-r--r--  libavcodec/arm/hevcdsp_deblock_neon.S | 385
-rw-r--r--  libavcodec/arm/hevcdsp_idct_neon.S | 465
-rw-r--r--  libavcodec/arm/hevcdsp_init_arm.c | 32
-rw-r--r--  libavcodec/arm/hevcdsp_init_neon.c | 224
-rw-r--r--  libavcodec/arm/hevcdsp_qpel_neon.S | 999
-rw-r--r--  libavcodec/arm/hpeldsp_arm.S | 8
-rw-r--r--  libavcodec/arm/hpeldsp_arm.h | 10
-rw-r--r--  libavcodec/arm/hpeldsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_armv6.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_neon.S | 8
-rw-r--r--  libavcodec/arm/idct.h | 8
-rw-r--r--  libavcodec/arm/idctdsp_arm.S | 10
-rw-r--r--  libavcodec/arm/idctdsp_arm.h | 8
-rw-r--r--  libavcodec/arm/idctdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/idctdsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/idctdsp_init_armv5te.c | 11
-rw-r--r--  libavcodec/arm/idctdsp_init_armv6.c | 14
-rw-r--r--  libavcodec/arm/idctdsp_init_neon.c | 17
-rw-r--r--  libavcodec/arm/idctdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/int_neon.S | 13
-rw-r--r--  libavcodec/arm/lossless_audiodsp_init_arm.c (renamed from libavcodec/arm/apedsp_init_arm.c) | 12
-rw-r--r--  libavcodec/arm/lossless_audiodsp_neon.S (renamed from libavcodec/arm/apedsp_neon.S) | 10
-rw-r--r--  libavcodec/arm/mathops.h | 8
-rw-r--r--  libavcodec/arm/mdct_fixed_init_arm.c | 40
-rw-r--r--  libavcodec/arm/mdct_fixed_neon.S | 8
-rw-r--r--  libavcodec/arm/mdct_init_arm.c | 47
-rw-r--r--  libavcodec/arm/mdct_neon.S | 8
-rw-r--r--  libavcodec/arm/mdct_vfp.S | 8
-rw-r--r--  libavcodec/arm/me_cmp_armv6.S | 8
-rw-r--r--  libavcodec/arm/me_cmp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mlpdsp_armv5te.S | 8
-rw-r--r--  libavcodec/arm/mlpdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/mlpdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 8
-rw-r--r--  libavcodec/arm/mpegaudiodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegvideo_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegvideo_arm.h | 8
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te.c | 13
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te_s.S | 8
-rw-r--r--  libavcodec/arm/mpegvideo_neon.S | 8
-rw-r--r--  libavcodec/arm/mpegvideoencdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/mpegvideoencdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/neon.S | 8
-rw-r--r--  libavcodec/arm/neontest.c | 28
-rw-r--r--  libavcodec/arm/pixblockdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/pixblockdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rdft_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rdft_neon.S | 8
-rw-r--r--  libavcodec/arm/rv34dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rv34dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/rv40dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rv40dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/sbrdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/sbrdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_arm.S | 10
-rw-r--r--  libavcodec/arm/simple_idct_armv5te.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_armv6.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_neon.S | 8
-rw-r--r--  libavcodec/arm/startcode.h | 8
-rw-r--r--  libavcodec/arm/startcode_armv6.S | 8
-rw-r--r--  libavcodec/arm/synth_filter_init_arm.c (renamed from libavcodec/arm/dcadsp_init_arm.c) | 40
-rw-r--r--  libavcodec/arm/synth_filter_neon.S | 8
-rw-r--r--  libavcodec/arm/synth_filter_vfp.S | 8
-rw-r--r--  libavcodec/arm/vc1dsp.h | 8
-rw-r--r--  libavcodec/arm/vc1dsp_init_arm.c | 10
-rw-r--r--  libavcodec/arm/vc1dsp_init_neon.c | 113
-rw-r--r--  libavcodec/arm/vc1dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/videodsp_arm.h | 8
-rw-r--r--  libavcodec/arm/videodsp_armv5te.S | 8
-rw-r--r--  libavcodec/arm/videodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/videodsp_init_armv5te.c | 10
-rw-r--r--  libavcodec/arm/vorbisdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vorbisdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp3dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp3dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp56_arith.h | 8
-rw-r--r--  libavcodec/arm/vp6dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp6dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp8.h | 8
-rw-r--r--  libavcodec/arm/vp8_armv6.S | 8
-rw-r--r--  libavcodec/arm/vp8dsp.h | 8
-rw-r--r--  libavcodec/arm/vp8dsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_armv6.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp9dsp_init.h | 29
-rw-r--r--  libavcodec/arm/vp9dsp_init_10bpp_arm.c | 23
-rw-r--r--  libavcodec/arm/vp9dsp_init_12bpp_arm.c | 23
-rw-r--r--  libavcodec/arm/vp9dsp_init_16bpp_arm_template.c | 256
-rw-r--r--  libavcodec/arm/vp9dsp_init_arm.c | 257
-rw-r--r--  libavcodec/arm/vp9itxfm_16bpp_neon.S | 1945
-rw-r--r--  libavcodec/arm/vp9itxfm_neon.S | 1688
-rw-r--r--  libavcodec/arm/vp9lpf_16bpp_neon.S | 1044
-rw-r--r--  libavcodec/arm/vp9lpf_neon.S | 959
-rw-r--r--  libavcodec/arm/vp9mc_16bpp_neon.S | 615
-rw-r--r--  libavcodec/arm/vp9mc_neon.S | 720
142 files changed, 10441 insertions, 1267 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index bd4dd4e4ce..1eeac5449e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,8 +21,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
arm/idctdsp_arm.o \
arm/jrevdct_arm.o \
arm/simple_idct_arm.o
-OBJS-$(CONFIG_MDCT) += arm/mdct_init_arm.o \
- arm/mdct_fixed_init_arm.o
+OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
@@ -39,12 +38,15 @@ OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
+OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
+ arm/vp9dsp_init_12bpp_arm.o \
+ arm/vp9dsp_init_arm.o
# ARMv5 optimizations
@@ -89,8 +91,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
- arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
# NEON optimizations
@@ -130,11 +131,20 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
- arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
+ arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_idct_neon.o \
+ arm/hevcdsp_qpel_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \
+ arm/vp9itxfm_neon.o \
+ arm/vp9lpf_16bpp_neon.o \
+ arm/vp9lpf_neon.o \
+ arm/vp9mc_16bpp_neon.o \
+ arm/vp9mc_neon.o
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 4f143cb8a9..cafa881fc7 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c
index 6326376004..e04787caae 100644
--- a/libavcodec/arm/aacpsdsp_init_arm.c
+++ b/libavcodec/arm/aacpsdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index fb00900a4d..a93bbfea9c 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index ed8eb37845..1aea190de9 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2028d0b89f..1d2563d4f7 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c
index a48353a099..a3c32ff407 100644
--- a/libavcodec/arm/ac3dsp_init_arm.c
+++ b/libavcodec/arm/ac3dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
const int16_t *window, unsigned n);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+ const int32_t *coef0,
+ const int32_t *coef1,
+ int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+ const float *coef0,
+ const float *coef1,
+ int len);
void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
int start, int end,
@@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
c->float_to_fixed24 = ff_float_to_fixed24_neon;
c->extract_exponents = ff_ac3_extract_exponents_neon;
c->apply_window_int16 = ff_apply_window_int16_neon;
+ c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+ c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
}
}
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index f97b1907df..89d0ae8048 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1
pop {r4,pc}
endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+ vmov.i64 q0, #0
+ vmov.i64 q1, #0
+ vmov.i64 q2, #0
+ vmov.i64 q3, #0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.s32 d18, d16, d17
+ vsub.s32 d19, d16, d17
+ vmlal.s32 q0, d16, d16
+ vmlal.s32 q1, d17, d17
+ vmlal.s32 q2, d18, d18
+ vmlal.s32 q3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vadd.s64 d0, d0, d1
+ vadd.s64 d1, d2, d3
+ vadd.s64 d2, d4, d5
+ vadd.s64 d3, d6, d7
+ vst1.64 {q0-q1}, [r0]
+ bx lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+ vmov.f32 q0, #0.0
+ vmov.f32 q1, #0.0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.f32 d18, d16, d17
+ vsub.f32 d19, d16, d17
+ vmla.f32 d0, d16, d16
+ vmla.f32 d1, d17, d17
+ vmla.f32 d2, d18, d18
+ vmla.f32 d3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vpadd.f32 d0, d0, d1
+ vpadd.f32 d1, d2, d3
+ vst1.32 {q0}, [r0]
+ bx lr
+endfunc
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index 0ea2f04e4a..a2174b0a08 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h
index e97e804de7..213660dae7 100644
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c
index ea9ec3ca10..74aa52a4ef 100644
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized audio functions
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c
index af532724c8..f7bd162482 100644
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S
index dfb998de32..ab32cef7ab 100644
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h
index 6d9c2c3ed2..59ebeb8466 100644
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,6 @@
#include "libavcodec/blockdsp.h"
-void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);
+void ff_blockdsp_init_neon(BlockDSPContext *c);
#endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c
index a0c03674d7..2080d5253f 100644
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized block operations
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,10 +24,10 @@
#include "libavcodec/blockdsp.h"
#include "blockdsp_arm.h"
-av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_arm(BlockDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
- ff_blockdsp_init_neon(c, high_bit_depth);
+ ff_blockdsp_init_neon(c);
}
diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c
index 5081cf0cdf..87c0d6d6eb 100644
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised block operations
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,10 +28,8 @@
void ff_clear_block_neon(int16_t *block);
void ff_clear_blocks_neon(int16_t *blocks);
-av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_neon(BlockDSPContext *c)
{
- if (!high_bit_depth) {
- c->clear_block = ff_clear_block_neon;
- c->clear_blocks = ff_clear_blocks_neon;
- }
+ c->clear_block = ff_clear_block_neon;
+ c->clear_blocks = ff_clear_blocks_neon;
}
diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S
index 98df2c60c4..9fc63cba5b 100644
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised block functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index 6ff5f1a385..fdbf86b45e 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
"tst %[r_c] , %[r_c] \n\t"
"bne 2f \n\t"
"ldr %[r_c] , [%[c], %[byte]] \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "ldrh %[tmp] , [%[r_c]] \n\t"
+ "add %[r_c] , %[r_c] , #2 \n\t"
+ "str %[r_c] , [%[c], %[byte]] \n\t"
+#else
"ldr %[r_b] , [%[c], %[end]] \n\t"
"ldrh %[tmp] , [%[r_c]] \n\t"
"cmp %[r_c] , %[r_b] \n\t"
"itt lt \n\t"
"addlt %[r_c] , %[r_c] , #2 \n\t"
"strlt %[r_c] , [%[c], %[byte]] \n\t"
+#endif
"sub %[r_c] , %[low] , #1 \n\t"
"add %[r_b] , %[tables] , %[norm_off] \n\t"
"eor %[r_c] , %[low] , %[r_c] \n\t"
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 4aed57603e..ae4b730a8a 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,10 +24,9 @@
#include <stdint.h>
#include "config.h"
-#include "libavcodec/dcadsp.h"
#include "libavcodec/mathops.h"
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4)
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
#define decode_blockcodes decode_blockcodes
static inline int decode_blockcodes(int code1, int code2, int levels,
@@ -35,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
{
int32_t v0, v1, v2, v3, v4, v5;
- __asm__ ("smmul %8, %14, %18 \n"
- "smmul %11, %15, %18 \n"
- "smlabb %14, %8, %17, %14 \n"
- "smlabb %15, %11, %17, %15 \n"
- "smmul %9, %8, %18 \n"
- "smmul %12, %11, %18 \n"
- "sub %14, %14, %16, lsr #1 \n"
- "sub %15, %15, %16, lsr #1 \n"
- "smlabb %8, %9, %17, %8 \n"
- "smlabb %11, %12, %17, %11 \n"
- "smmul %10, %9, %18 \n"
- "smmul %13, %12, %18 \n"
- "str %14, %0 \n"
- "str %15, %4 \n"
- "sub %8, %8, %16, lsr #1 \n"
- "sub %11, %11, %16, lsr #1 \n"
- "smlabb %9, %10, %17, %9 \n"
- "smlabb %12, %13, %17, %12 \n"
- "smmul %14, %10, %18 \n"
- "smmul %15, %13, %18 \n"
- "str %8, %1 \n"
- "str %11, %5 \n"
- "sub %9, %9, %16, lsr #1 \n"
- "sub %12, %12, %16, lsr #1 \n"
- "smlabb %10, %14, %17, %10 \n"
- "smlabb %13, %15, %17, %13 \n"
- "str %9, %2 \n"
- "str %12, %6 \n"
- "sub %10, %10, %16, lsr #1 \n"
- "sub %13, %13, %16, lsr #1 \n"
- "str %10, %3 \n"
- "str %13, %7 \n"
- : "=m"(values[0]), "=m"(values[1]),
- "=m"(values[2]), "=m"(values[3]),
- "=m"(values[4]), "=m"(values[5]),
- "=m"(values[6]), "=m"(values[7]),
- "=&r"(v0), "=&r"(v1), "=&r"(v2),
+ __asm__ ("smmul %0, %6, %10 \n"
+ "smmul %3, %7, %10 \n"
+ "smlabb %6, %0, %9, %6 \n"
+ "smlabb %7, %3, %9, %7 \n"
+ "smmul %1, %0, %10 \n"
+ "smmul %4, %3, %10 \n"
+ "sub %6, %6, %8, lsr #1 \n"
+ "sub %7, %7, %8, lsr #1 \n"
+ "smlabb %0, %1, %9, %0 \n"
+ "smlabb %3, %4, %9, %3 \n"
+ "smmul %2, %1, %10 \n"
+ "smmul %5, %4, %10 \n"
+ "str %6, [%11, #0] \n"
+ "str %7, [%11, #16] \n"
+ "sub %0, %0, %8, lsr #1 \n"
+ "sub %3, %3, %8, lsr #1 \n"
+ "smlabb %1, %2, %9, %1 \n"
+ "smlabb %4, %5, %9, %4 \n"
+ "smmul %6, %2, %10 \n"
+ "smmul %7, %5, %10 \n"
+ "str %0, [%11, #4] \n"
+ "str %3, [%11, #20] \n"
+ "sub %1, %1, %8, lsr #1 \n"
+ "sub %4, %4, %8, lsr #1 \n"
+ "smlabb %2, %6, %9, %2 \n"
+ "smlabb %5, %7, %9, %5 \n"
+ "str %1, [%11, #8] \n"
+ "str %4, [%11, #24] \n"
+ "sub %2, %2, %8, lsr #1 \n"
+ "sub %5, %5, %8, lsr #1 \n"
+ "str %2, [%11, #12] \n"
+ "str %5, [%11, #28] \n"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2),
"=&r"(v3), "=&r"(v4), "=&r"(v5),
"+&r"(code1), "+&r"(code2)
- : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+ : "r"(levels - 1), "r"(-levels),
+ "r"(ff_inverse[levels]), "r"(values)
+ : "memory");
return code1 | code2;
}
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 735c4c28e5..0000000000
--- a/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
- push {r4-r6,lr}
- mov r3, #32 @ decifactor
- mov r6, #256/32
- b dca_lfe_fir
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
- push {r4-r6,lr}
- mov r3, #64 @ decifactor
- mov r6, #256/64
-dca_lfe_fir:
- add r4, r0, r3, lsl #2 @ out2
- add r5, r2, #256*4-16 @ cf1
- sub r1, r1, #12
- mov lr, #-16
-1:
- vmov.f32 q2, #0.0 @ v0
- vmov.f32 q3, #0.0 @ v1
- mov r12, r6
-2:
- vld1.32 {q8}, [r2,:128]! @ cf0
- vld1.32 {q9}, [r5,:128], lr @ cf1
- vld1.32 {q1}, [r1], lr @ in
- subs r12, r12, #4
- vrev64.32 q10, q8
- vmla.f32 q3, q1, q9
- vmla.f32 d4, d2, d21
- vmla.f32 d5, d3, d20
- bne 2b
-
- add r1, r1, r6, lsl #2
- subs r3, r3, #1
- vadd.f32 d4, d4, d5
- vadd.f32 d6, d6, d7
- vpadd.f32 d5, d4, d6
- vst1.32 {d5[0]}, [r0,:32]!
- vst1.32 {d5[1]}, [r4,:32]!
- bne 1b
-
- pop {r4-r6,pc}
-endfunc
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
deleted file mode 100644
index c9114d499a..0000000000
--- a/libavcodec/arm/dcadsp_vfp.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-POUT .req a1
-PIN .req a2
-PCOEF .req a3
-OLDFPSCR .req a4
-COUNTER .req ip
-
-IN0 .req s4
-IN1 .req s5
-IN2 .req s6
-IN3 .req s7
-IN4 .req s0
-IN5 .req s1
-IN6 .req s2
-IN7 .req s3
-COEF0 .req s8 @ coefficient elements
-COEF1 .req s9
-COEF2 .req s10
-COEF3 .req s11
-COEF4 .req s12
-COEF5 .req s13
-COEF6 .req s14
-COEF7 .req s15
-ACCUM0 .req s16 @ double-buffered multiply-accumulate results
-ACCUM4 .req s20
-POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
-POST1 .req s25
-POST2 .req s26
-POST3 .req s27
-
-
-.macro inner_loop decifactor, dir, tail, head
- .ifc "\dir","up"
- .set X, 0
- .set Y, 4
- .else
- .set X, 4*JMAX*4 - 4
- .set Y, -4
- .endif
- .ifnc "\head",""
- vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
- vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
- .endif
- .ifnc "\head",""
- vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
- vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
- .ifc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
- .ifnc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
- vstmia POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
- vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
- vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
- .if \decifactor == 32
- vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
- vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
- vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
- vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
- vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
- vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
- .endif
- .endif
-.endm
-
-.macro dca_lfe_fir decifactor
-function ff_dca_lfe_fir\decifactor\()_vfp, export=1
- fmrx OLDFPSCR, FPSCR
- ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, ip
- vldr IN0, [PIN, #-0*4]
- vldr IN1, [PIN, #-1*4]
- vldr IN2, [PIN, #-2*4]
- vldr IN3, [PIN, #-3*4]
- .if \decifactor == 32
- .set JMAX, 8
- vpush {s16-s31}
- vldr IN4, [PIN, #-4*4]
- vldr IN5, [PIN, #-5*4]
- vldr IN6, [PIN, #-6*4]
- vldr IN7, [PIN, #-7*4]
- .else
- .set JMAX, 4
- vpush {s16-s27}
- .endif
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, up,, head
-1: add PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, up, tail, head
- bne 1b
- inner_loop \decifactor, up, tail
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, down,, head
-1: sub PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, down, tail, head
- bne 1b
- inner_loop \decifactor, down, tail
-
- .if \decifactor == 32
- vpop {s16-s31}
- .else
- vpop {s16-s27}
- .endif
- fmxr FPSCR, OLDFPSCR
- bx lr
-endfunc
-.endm
-
- dca_lfe_fir 64
- .ltorg
- dca_lfe_fir 32
-
- .unreq POUT
- .unreq PIN
- .unreq PCOEF
- .unreq OLDFPSCR
- .unreq COUNTER
-
- .unreq IN0
- .unreq IN1
- .unreq IN2
- .unreq IN3
- .unreq IN4
- .unreq IN5
- .unreq IN6
- .unreq IN7
- .unreq COEF0
- .unreq COEF1
- .unreq COEF2
- .unreq COEF3
- .unreq COEF4
- .unreq COEF5
- .unreq COEF6
- .unreq COEF7
- .unreq ACCUM0
- .unreq ACCUM4
- .unreq POST0
- .unreq POST1
- .unreq POST2
- .unreq POST3
-
-
-IN .req a1
-SBACT .req a2
-OLDFPSCR .req a3
-IMDCT .req a4
-WINDOW .req v1
-OUT .req v2
-BUF .req v3
-SCALEINT .req v4 @ only used in softfp case
-COUNT .req v5
-
-SCALE .req s0
-
-/* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- * fp -> 6 arg words saved by caller
- * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * s0 on entry
- * sp -> 3 arg words for callee
- *
- * softfp
- * fp -> 7 arg words saved by caller
- * a4,v1-v5,fp,lr on entry
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * sp -> 4 arg words for callee
- */
-
-/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- * SynthFilterContext *synth, FFTContext *imdct,
- * float (*synth_buf_ptr)[512],
- * int *synth_buf_offset, float (*synth_buf2)[32],
- * const float (*window)[512], float *samples_out,
- * float (*raXin)[32], float scale);
- */
-function ff_dca_qmf_32_subbands_vfp, export=1
-VFP push {a3-a4,v1-v3,v5,fp,lr}
-NOVFP push {a4,v1-v5,fp,lr}
- add fp, sp, #8*4
- vpush {s16-s23}
- @ The buffer pointed at by raXin isn't big enough for us to do a
- @ complete matrix transposition as we want to, so allocate an
- @ alternative buffer from the stack. Align to 4 words for speed.
- sub BUF, sp, #8*32*4
- bic BUF, BUF, #15
- mov sp, BUF
- ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
- fmrx OLDFPSCR, FPSCR
- fmxr FPSCR, lr
- @ COUNT is used to count down 2 things at once:
- @ bits 0-4 are the number of word pairs remaining in the output row
- @ bits 5-31 are the number of words to copy (with possible negation)
- @ from the source matrix before we start zeroing the remainder
- mov COUNT, #(-4 << 5) + 16
- adds COUNT, COUNT, SBACT, lsl #5
- bmi 2f
-1:
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- vldr s9, [IN, #(3*8+0)*4]
- vldr s11, [IN, #(3*8+1)*4]
- vldr s13, [IN, #(3*8+2)*4]
- vldr s15, [IN, #(3*8+3)*4]
- vldr s17, [IN, #(3*8+4)*4]
- vldr s19, [IN, #(3*8+5)*4]
- vldr s21, [IN, #(3*8+6)*4]
- vldr s23, [IN, #(3*8+7)*4]
- vneg.f s9, s9
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vneg.f s17, s17
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vstr d4, [BUF, #(0*32+2)*4]
- vstr d5, [BUF, #(1*32+2)*4]
- vstr d6, [BUF, #(2*32+2)*4]
- vstr d7, [BUF, #(3*32+2)*4]
- vstr d8, [BUF, #(4*32+2)*4]
- vstr d9, [BUF, #(5*32+2)*4]
- vstr d10, [BUF, #(6*32+2)*4]
- vstr d11, [BUF, #(7*32+2)*4]
- add IN, IN, #4*8*4
- add BUF, BUF, #4*4
- subs COUNT, COUNT, #(4 << 5) + 2
- bpl 1b
-2: @ Now deal with trailing < 4 samples
- adds COUNT, COUNT, #3 << 5
- bmi 4f @ sb_act was a multiple of 4
- bics lr, COUNT, #0x1F
- bne 3f
- @ sb_act was n*4+1
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vneg.f s16, s16
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
- b 4f
-3: @ sb_act was n*4+2 or n*4+3, so do the first 2
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #(2 << 5) + 1
- bics lr, COUNT, #0x1F
- bne 4f
- @ sb_act was n*4+3
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
-4: @ Now fill the remainder with 0
- vldr s8, zero
- vldr s9, zero
- ands COUNT, COUNT, #0x1F
- beq 6f
-5: vstr d4, [BUF, #(0*32+0)*4]
- vstr d4, [BUF, #(1*32+0)*4]
- vstr d4, [BUF, #(2*32+0)*4]
- vstr d4, [BUF, #(3*32+0)*4]
- vstr d4, [BUF, #(4*32+0)*4]
- vstr d4, [BUF, #(5*32+0)*4]
- vstr d4, [BUF, #(6*32+0)*4]
- vstr d4, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- subs COUNT, COUNT, #1
- bne 5b
-6:
- fmxr FPSCR, OLDFPSCR
- ldr WINDOW, [fp, #3*4]
- ldr OUT, [fp, #4*4]
- sub BUF, BUF, #32*4
-NOVFP ldr SCALEINT, [fp, #6*4]
- mov COUNT, #8
-VFP vpush {SCALE}
-VFP sub sp, sp, #3*4
-NOVFP sub sp, sp, #4*4
-7:
-VFP ldr a1, [fp, #-7*4] @ imdct
-NOVFP ldr a1, [fp, #-8*4]
- ldmia fp, {a2-a4}
-VFP stmia sp, {WINDOW, OUT, BUF}
-NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
-VFP vldr SCALE, [sp, #3*4]
- bl X(ff_synth_filter_float_vfp)
- add OUT, OUT, #32*4
- add BUF, BUF, #32*4
- subs COUNT, COUNT, #1
- bne 7b
-
-A sub sp, fp, #(8+8)*4
-T sub fp, fp, #(8+8)*4
-T mov sp, fp
- vpop {s16-s23}
-VFP pop {a3-a4,v1-v3,v5,fp,pc}
-NOVFP pop {a4,v1-v5,fp,pc}
-endfunc
-
- .unreq IN
- .unreq SBACT
- .unreq OLDFPSCR
- .unreq IMDCT
- .unreq WINDOW
- .unreq OUT
- .unreq BUF
- .unreq SCALEINT
- .unreq COUNT
-
- .unreq SCALE
-
- .align 2
-zero: .word 0
diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c
index 5132b0959f..11226d65ff 100644
--- a/libavcodec/arm/fft_fixed_init_arm.c
+++ b/libavcodec/arm/fft_fixed_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,6 +26,8 @@
#include "libavcodec/fft.h"
void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
+void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
av_cold void ff_fft_fixed_init_arm(FFTContext *s)
{
@@ -33,6 +35,16 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s)
if (have_neon(cpu_flags)) {
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
s->fft_calc = ff_fft_fixed_calc_neon;
+#endif
+
+#if CONFIG_MDCT
+ if (!s->inverse && s->nbits >= 3) {
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+ s->mdct_calc = ff_mdct_fixed_calc_neon;
+ s->mdct_calcw = ff_mdct_fixed_calcw_neon;
+ }
+#endif
}
}
diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S
index c70a18991a..2651607544 100644
--- a/libavcodec/arm/fft_fixed_neon.S
+++ b/libavcodec/arm/fft_fixed_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 4d047eaf13..331bd65e5c 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,16 +29,33 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
av_cold void ff_fft_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp_vm(cpu_flags)) {
s->fft_calc = ff_fft_calc_vfp;
+#if CONFIG_MDCT
+ s->imdct_half = ff_imdct_half_vfp;
+#endif
}
if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
+#endif
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
}
}
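
A note on the pattern in the two init hunks above: the CONFIG_FFT / CONFIG_MDCT preprocessor guards decide whether the optimized kernels are compiled into the build at all, while the av_get_cpu_flags() / have_neon() check decides at run time whether to install them. The following is a minimal standalone sketch of that two-level dispatch, not FFmpeg code; the context type, function names and flag macro are invented stand-ins for FFTContext, the NEON kernels and AV_CPU_FLAG_NEON.

    /* Standalone sketch of the guarded-init pattern used above. */
    #include <stdio.h>

    #define CONFIG_MDCT 1            /* stand-in for the build-system define */
    #define FLAG_NEON   (1 << 0)     /* stand-in for AV_CPU_FLAG_NEON        */

    typedef struct Ctx {
        void (*calc)(struct Ctx *s, float *buf, int n);
    } Ctx;

    static void calc_c(Ctx *s, float *buf, int n)    { (void)s; (void)buf; (void)n; puts("C path"); }
    #if CONFIG_MDCT
    static void calc_neon(Ctx *s, float *buf, int n) { (void)s; (void)buf; (void)n; puts("NEON path"); }
    #endif

    static int get_cpu_flags(void) { return FLAG_NEON; }  /* pretend NEON is present */

    static void init_arm(Ctx *s)
    {
        int cpu_flags = get_cpu_flags();
        s->calc = calc_c;                /* portable default */
    #if CONFIG_MDCT                      /* compile time: kernel exists in this build */
        if (cpu_flags & FLAG_NEON)       /* run time: CPU actually has NEON */
            s->calc = calc_neon;
    #endif
    }

    int main(void)
    {
        Ctx c;
        float buf[8] = { 0 };
        init_arm(&c);
        c.calc(&c, buf, 8);              /* prints "NEON path" */
        return 0;
    }

The same shape recurs throughout this directory: a safe C default, a compile-time guard around the optimized symbol, and a run-time CPU-flag test before installing it.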
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
index b161015e39..48f8dfc424 100644
--- a/libavcodec/arm/fft_neon.S
+++ b/libavcodec/arm/fft_neon.S
@@ -7,20 +7,20 @@
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index c2801fa1a9..ac601325f2 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S
index d4441da1bb..f8861c5967 100644
--- a/libavcodec/arm/flacdsp_arm.S
+++ b/libavcodec/arm/flacdsp_arm.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c
index 0530cf7a85..564e3dc79b 100644
--- a/libavcodec/arm/flacdsp_init_arm.c
+++ b/libavcodec/arm/flacdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,9 +24,9 @@
void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
- if (bps <= 16)
- c->lpc = ff_flac_lpc_16_arm;
+ if (CONFIG_FLAC_DECODER)
+ c->lpc16 = ff_flac_lpc_16_arm;
}
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 11396e898c..a734decec0 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized Format Conversion Utils
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 5d48e3d197..738953e8fc 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 4e43f425a5..b14af454eb 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c
index 5edf619f17..c0e5d8b989 100644
--- a/libavcodec/arm/g722dsp_init_arm.c
+++ b/libavcodec/arm/g722dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S
index 5fa3c279e9..757e53f167 100644
--- a/libavcodec/arm/g722dsp_neon.S
+++ b/libavcodec/arm/g722dsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions for G722 coding
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 6f365533cf..13f7e0d702 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index ee7011b00b..fc48a6f8f6 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -455,7 +455,7 @@ endconst
h264_chroma_mc4 avg, rv40
#endif
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350890..90144d0da2 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_NEON
if (bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ if (chroma_format_idc == 1) {
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ }
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
@@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
}
+#endif // HAVE_NEON
}
av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
@@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_ARMV6
if (have_setend(cpu_flags))
c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
if (have_neon(cpu_flags))
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565b3e..274a547f26 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index f588f3e744..4f68bdb9f5 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index a445d4d667..cc324d7dca 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_NEON
const int high_depth = bit_depth > 8;
if (high_depth)
@@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
}
av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S
index 332f94bd53..4dc47ba8f1 100644
--- a/libavcodec/arm/h264pred_neon.S
+++ b/libavcodec/arm/h264pred_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c
index 01615b5719..71237be359 100644
--- a/libavcodec/arm/h264qpel_init_arm.c
+++ b/libavcodec/arm/h264qpel_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S
index 6c51250d5b..21336c6c32 100644
--- a/libavcodec/arm/h264qpel_neon.S
+++ b/libavcodec/arm/h264qpel_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h
new file mode 100644
index 0000000000..7735df9cd2
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..166bddb104
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start
+ ldr r12, [r2]
+ ldr r3, [r2, #4]
+ add r2, r3, r12
+ cmp r2, #0
+ it eq
+ bxeq lr
+.endm
+
+.macro hevc_loop_filter_chroma_body
+ vsubl.u8 q3, d4, d2
+ vsubl.u8 q11, d18, d19
+ vshl.i16 q3, #2
+ vadd.i16 q11, q3
+ vdup.16 d0, r12
+ vdup.16 d1, r3
+ vrshr.s16 q11, q11, #3
+ vneg.s16 q12, q0
+ vmovl.u8 q2, d4
+ vmin.s16 q11, q11, q0
+ vmax.s16 q11, q11, q12
+ vaddw.u8 q1, q11, d2
+ vsub.i16 q2, q11
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+.endm
+
+.macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+ lsl r3, #16
+ orr r3, r12
+ cmp r3, #0
+ it eq
+ bxeq lr
+ lsr r3, #16
+.endm
+
+.macro hevc_loop_filter_luma_body
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+ vmovl.u8 q14, d28
+ vmovl.u8 q15, d30
+
+ vadd.i16 q7, q9, q11
+ vadd.i16 q6, q14, q12
+ vsub.i16 q7, q10
+ vsub.i16 q6, q13
+ vabd.s16 q7, q7, q10
+ vabd.s16 q6, q6, q13
+
+
+ vdup.16 q0, r2
+ vmov q4, q7
+ vmov q5, q6
+ vdup.16 d4, r12
+ vtrn.16 q7, q4
+ vtrn.16 q6, q5
+
+ vshl.u64 q7, #32
+ vshr.u64 q4, #32
+ vshl.u64 q6, #32
+ vshr.u64 q5, #32
+ vshr.u64 q7, #32
+ vshr.u64 q6, #32
+ vshl.u64 q5, #32
+ vshl.u64 q4, #32
+ vorr q6, q5
+ vorr q7, q4
+ vdup.16 d5, r3
+ vadd.i16 q5, q7, q6
+
+ vmov q4, q5
+ vmov q3, q5
+ vtrn.32 q3, q4
+
+ vadd.i16 q4, q3
+
+ vshl.s16 q5, q5, #1
+ vcgt.s16 q3, q0, q4
+
+ vmovn.i16 d6, q3
+ vshr.s16 q1, q0, #2
+ vmovn.i16 d6, q3
+ vcgt.s16 q5, q1, q5
+ vmov r7, s12
+ cmp r7, #0
+ beq bypasswrite
+
+ vpadd.i32 d0, d14, d12
+ vpadd.i32 d1, d15, d13
+ vmov q4, q2
+ vshl.s16 q2, #2
+ vshr.s16 q1, q1, #1
+ vrhadd.s16 q2, q4
+
+ vabd.s16 q7, q8, q11
+ vaba.s16 q7, q15, q12
+
+ vmovn.i32 d0, q0
+ vmov r5, r6, s0, s1
+ vcgt.s16 q6, q1, q7
+ vand q5, q5, q6
+ vabd.s16 q7, q11, q12
+ vcgt.s16 q6, q2, q7
+ vand q5, q5, q6
+
+ vmov q2, q5
+ vtrn.s16 q5, q2
+ vshr.u64 q2, #32
+ vshl.u64 q5, #32
+ vshl.u64 q2, #32
+ vshr.u64 q5, #32
+ vorr q5, q2
+
+ vmov q2, q5
+ vshl.i16 q7, q4, #1
+ vtrn.32 q2, q5
+ vand q5, q2
+ vneg.s16 q6, q7
+ vmovn.i16 d4, q5
+ vmovn.i16 d4, q2
+ vmov r8, s8
+
+ and r9, r8, r7
+ cmp r9, #0
+ beq weakfilter_\@
+
+ vadd.i16 q2, q11, q12
+ vadd.i16 q4, q9, q8
+ vadd.i16 q1, q2, q10
+ vdup.16 d10, r9
+ vadd.i16 q0, q1, q9
+ vshl.i16 q4, #1
+ lsr r9, #16
+ vadd.i16 q1, q0
+ vrshr.s16 q3, q0, #2
+ vadd.i16 q1, q13
+ vadd.i16 q4, q0
+ vsub.i16 q3, q10
+ vrshr.s16 q1, #3
+ vrshr.s16 q4, #3
+ vmax.s16 q3, q6
+ vsub.i16 q1, q11
+ vsub.i16 q4, q9
+ vmin.s16 q3, q7
+ vmax.s16 q4, q6
+ vmax.s16 q1, q6
+ vadd.i16 q3, q10
+ vmin.s16 q4, q7
+ vmin.s16 q1, q7
+ vdup.16 d11, r9
+ vadd.i16 q4, q9
+ vadd.i16 q1, q11
+ vbit q9, q4, q5
+ vadd.i16 q4, q2, q13
+ vbit q11, q1, q5
+ vadd.i16 q0, q4, q14
+ vadd.i16 q2, q15, q14
+ vadd.i16 q4, q0
+
+ vshl.i16 q2, #1
+ vadd.i16 q4, q10
+ vbit q10, q3, q5
+ vrshr.s16 q4, #3
+ vadd.i16 q2, q0
+ vrshr.s16 q3, q0, #2
+ vsub.i16 q4, q12
+ vrshr.s16 q2, #3
+ vsub.i16 q3, q13
+ vmax.s16 q4, q6
+ vsub.i16 q2, q14
+ vmax.s16 q3, q6
+ vmin.s16 q4, q7
+ vmax.s16 q2, q6
+ vmin.s16 q3, q7
+ vadd.i16 q4, q12
+ vmin.s16 q2, q7
+ vadd.i16 q3, q13
+ vbit q12, q4, q5
+ vadd.i16 q2, q14
+ vbit q13, q3, q5
+ vbit q14, q2, q5
+
+weakfilter_\@:
+ mvn r8, r8
+ and r9, r8, r7
+ cmp r9, #0
+ beq ready_\@
+
+ vdup.16 q4, r2
+
+ vdup.16 d10, r9
+ lsr r9, #16
+ vmov q1, q4
+ vdup.16 d11, r9
+ vshr.s16 q1, #1
+ vsub.i16 q2, q12, q11
+ vadd.i16 q4, q1
+ vshl.s16 q0, q2, #3
+ vshr.s16 q4, #3
+ vadd.i16 q2, q0
+ vsub.i16 q0, q13, q10
+ vsub.i16 q2, q0
+ vshl.i16 q0, q0, #1
+ vsub.i16 q2, q0
+ vshl.s16 q1, q7, #2
+ vrshr.s16 q2, q2, #4
+ vadd.i16 q1, q7
+ vabs.s16 q3, q2
+ vshr.s16 q6, q6, #1
+ vcgt.s16 q1, q1, q3
+ vand q5, q1
+ vshr.s16 q7, q7, #1
+ vmax.s16 q2, q2, q6
+ vmin.s16 q2, q2, q7
+
+ vshr.s16 q7, q7, #1
+ vrhadd.s16 q3, q9, q11
+ vneg.s16 q6, q7
+ vsub.s16 q3, q10
+ vdup.16 d2, r5
+ vhadd.s16 q3, q2
+ vdup.16 d3, r6
+ vmax.s16 q3, q3, q6
+ vcgt.s16 q1, q4, q1
+ vmin.s16 q3, q3, q7
+ vand q1, q5
+ vadd.i16 q3, q10
+ lsr r5, #16
+ lsr r6, #16
+ vbit q10, q3, q1
+
+ vrhadd.s16 q3, q14, q12
+ vdup.16 d2, r5
+ vsub.s16 q3, q13
+ vdup.16 d3, r6
+ vhsub.s16 q3, q2
+ vcgt.s16 q1, q4, q1
+ vmax.s16 q3, q3, q6
+ vand q1, q5
+ vmin.s16 q3, q3, q7
+ vadd.i16 q3, q13
+ vbit q13, q3, q1
+ vadd.i16 q0, q11, q2
+ vsub.i16 q4, q12, q2
+ vbit q11, q0, q5
+ vbit q12, q4, q5
+
+ready_\@:
+ vqmovun.s16 d16, q8
+ vqmovun.s16 d18, q9
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d26, q13
+ vqmovun.s16 d28, q14
+ vqmovun.s16 d30, q15
+.endm
+
+function ff_hevc_v_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+ push {r5-r11}
+ vpush {d8-d15}
+ sub r0, #4
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d30}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+ hevc_loop_filter_luma_body
+ transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d28}, [r0], r1
+ vst1.8 {d30}, [r0]
+ vpop {d8-d15}
+ pop {r5-r11}
+ bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+ push {r5-r11}
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d30}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r1
+ hevc_loop_filter_luma_body
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d28}, [r0]
+bypasswrite:
+ vpop {d8-d15}
+ pop {r5-r11}
+ bx lr
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, #4
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d17}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d19}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d21}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+ hevc_loop_filter_chroma_body
+ transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d21}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, r0, r1, lsl #1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d19}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_chroma_body
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0]
+ bx lr
+endfunc
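
For reference, hevc_loop_filter_chroma_body above maps onto a short scalar filter. The sketch below is a standalone model of one pixel column, not the FFmpeg implementation; the p/q naming is inferred from the load/transpose order in ff_hevc_v_loop_filter_chroma_neon (d2 = p0, d4 = q0, d18 = p1, d19 = q1) and tc comes from the two halfwords loaded in hevc_loop_filter_chroma_start.

    /* Scalar model of the NEON chroma deblocking body (one pixel column). */
    #include <stdint.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    static void chroma_deblock_column(uint8_t *p0, uint8_t *q0,
                                      uint8_t p1, uint8_t q1, int tc)
    {
        /* ((q0 - p0) << 2) + (p1 - q1), rounded shift by 3:
         * the vsubl/vshl/vadd/vrshr sequence in the macro. */
        int delta = ((*q0 - *p0) * 4 + p1 - q1 + 4) >> 3;

        /* vmin/vmax against +tc / -tc */
        if (delta >  tc) delta =  tc;
        if (delta < -tc) delta = -tc;

        /* vaddw/vsub followed by vqmovun: adjust p0 and q0, saturate to u8 */
        *p0 = clip_u8(*p0 + delta);
        *q0 = clip_u8(*q0 - delta);
    }

The vertical variant transposes 8x8 blocks so the same body filters across the vertical edge; the horizontal variant skips the transpose and filters the two rows on each side of the edge directly.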
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..e39d00634b
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ mov r3, #16
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+1: subs r3, #1
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_4x4_neon_8, export=1
+ vldm r1, {q0-q1}
+ vld1.32 d4[0], [r0], r2
+ vld1.32 d4[1], [r0], r2
+ vld1.32 d5[0], [r0], r2
+ vld1.32 d5[1], [r0], r2
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vqadd.s16 q0, q0, q8
+ vqadd.s16 q1, q1, q9
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r2
+ vst1.32 d0[1], [r0], r2
+ vst1.32 d1[0], [r0], r2
+ vst1.32 d1[1], [r0], r2
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_8x8_neon_8, export=1
+ mov r3, #8
+1: subs r3, #1
+ vld1.16 {q0}, [r1]!
+ vld1.8 d16, [r0]
+ vmovl.u8 q8, d16
+ vqadd.s16 q0, q8
+ vqmovun.s16 d0, q0
+ vst1.32 d0, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_16x16_neon_8, export=1
+ mov r3, #16
+1: subs r3, #1
+ vld1.16 {q0, q1}, [r1]!
+ vld1.8 {q8}, [r0]
+ vmovl.u8 q9, d16
+ vmovl.u8 q10, d17
+ vqadd.s16 q0, q9
+ vqadd.s16 q1, q10
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 {q0}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_32x32_neon_8, export=1
+ mov r3, #32
+1: subs r3, #1
+ vldm r1!, {q0-q3}
+ vld1.8 {q8, q9}, [r0]
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.8 {q0, q1}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
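+
The four add_residual routines above differ only in block size: each widens the destination pixels to 16 bit, adds the residual with signed saturation and narrows back with unsigned saturation. A scalar equivalent, given here only as a reading aid (the function name is invented, not an FFmpeg symbol):

    /* Scalar model of ff_hevc_add_residual_*_neon_8:
     * dst += residual, clamped to [0,255] (vmovl.u8 + vqadd.s16 + vqmovun.s16). */
    #include <stdint.h>
    #include <stddef.h>

    static void add_residual_c(uint8_t *dst, const int16_t *res,
                               ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
            }
            dst += stride;      /* picture rows are stride bytes apart   */
            res += size;        /* residual rows are packed back to back */
        }
    }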
+.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.64 \r0, \r4
+ vtrn.64 \r1, \r5
+ vtrn.64 \r2, \r6
+ vtrn.64 \r3, \r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+// in 4 q regs
+// output 8 d regs
+.macro transpose_16b_4x4 r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+/* uses registers q2 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+ vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2
+ vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3
+ vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3
+ vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1
+
+ vaddl.s16 q7, \r0, \r3 // src0 + src3
+ vsubw.s16 q7, q7, \r2 // src0 - src2 + src3
+ vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3)
+
+ vmul.s32 q8, q5, d0[1] // 29 * c0
+ vmul.s32 q9, q2, d1[0] // 55 * c1
+ vadd.s32 q8, q9 // 29 * c0 + 55 * c1
+ vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3
+
+ vmul.s32 q2, q2, d0[1] // 29 * c1
+ vmul.s32 q9, q4, d1[0] // 55 * c2
+ vsub.s32 q9, q2 // 55 * c2 - 29 * c1
+ vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3
+
+ vmul.s32 q5, q5, d1[0] // 55 * c0
+ vmul.s32 q4, q4, d0[1] // 29 * c2
+ vadd.s32 q5, q4 // 55 * c0 + 29 * c2
+ vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3
+
+ vqrshrn.s32 \r0, q8, \shift
+ vqrshrn.s32 \r1, q9, \shift
+ vqrshrn.s32 \r2, q7, \shift
+ vqrshrn.s32 \r3, q5, \shift
+.endm
+
+/* uses registers q2 - q6 for temp values */
+.macro tr4 r0, r1, r2, r3
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+.endm
+
+.macro tr4_shift r0, r1, r2, r3, shift
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+
+ vqrshrn.s32 \r0, q4, \shift
+ vqrshrn.s32 \r1, q5, \shift
+ vqrshrn.s32 \r2, q6, \shift
+ vqrshrn.s32 \r3, q3, \shift
+.endm
+
+function ff_hevc_transform_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x00240053 // 36 and 83
+ vmov.32 d0[0], r3
+
+ tr4_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x4a // 74
+ vmov.32 d0[0], r3
+ ldr r3, =0x1d // 29
+ vmov.32 d0[1], r3
+ ldr r3, =0x37 // 55
+ vmov.32 d1[0], r3
+
+ tr4_luma_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_luma_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+.macro tr8_begin in0, in1, in2, in3
+ vmull.s16 q7, \in0, d1[1] // 89 * src1
+ vmull.s16 q8, \in0, d1[0] // 75 * src1
+ vmull.s16 q9, \in0, d1[3] // 50 * src1
+ vmull.s16 q10, \in0, d1[2] // 18 * src1
+
+ vmlal.s16 q7, \in1, d1[0] // 75 * src3
+ vmlsl.s16 q8, \in1, d1[2] //-18 * src3
+ vmlsl.s16 q9, \in1, d1[1] //-89 * src3
+ vmlsl.s16 q10, \in1, d1[3] //-50 * src3
+
+ vmlal.s16 q7, \in2, d1[3] // 50 * src5
+ vmlsl.s16 q8, \in2, d1[1] //-89 * src5
+ vmlal.s16 q9, \in2, d1[2] // 18 * src5
+ vmlal.s16 q10, \in2, d1[0] // 75 * src5
+
+ vmlal.s16 q7, \in3, d1[2] // 18 * src7
+ vmlsl.s16 q8, \in3, d1[3] //-50 * src7
+ vmlal.s16 q9, \in3, d1[0] // 75 * src7
+ vmlsl.s16 q10, \in3, d1[1] //-89 * src7
+.endm
+
+.macro tr8_end shift
+ vadd.s32 q1, q4, q7 // e_8[0] + o_8[0], dst[0]
+ vsub.s32 q4, q4, q7 // e_8[0] - o_8[0], dst[7]
+
+ vadd.s32 q2, q5, q8 // e_8[1] + o_8[1], dst[1]
+ vsub.s32 q5, q5, q8 // e_8[1] - o_8[1], dst[6]
+
+ vadd.s32 q11, q6, q9 // e_8[2] + o_8[2], dst[2]
+ vsub.s32 q6, q6, q9 // e_8[2] - o_8[2], dst[5]
+
+ vadd.s32 q12, q3, q10 // e_8[3] + o_8[3], dst[3]
+ vsub.s32 q3, q3, q10 // e_8[3] - o_8[3], dst[4]
+ vqrshrn.s32 d2, q1, \shift
+ vqrshrn.s32 d3, q2, \shift
+ vqrshrn.s32 d4, q11, \shift
+ vqrshrn.s32 d5, q12, \shift
+ vqrshrn.s32 d6, q3, \shift
+ vqrshrn.s32 d7, q6, \shift
+ vqrshrn.s32 d9, q4, \shift
+ vqrshrn.s32 d8, q5, \shift
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+ push {r4-r8}
+ vpush {d8-d15}
+ mov r5, #16
+
+ adr r3, tr4f
+ vld1.16 {d0, d1}, [r3]
+
+ // left half
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #128
+ //skip right half if col_limit in r1 is less than 4
+ cmp r1, #4
+ blt 1f
+ //right half
+ add r0, #8
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #136
+1:
+ // top half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ vstm r0!, {q1-q4}
+
+ // bottom half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ //vstm r0, {q1-q4}
+ vst1.16 {q1-q2}, [r0]
+ add r0, #32
+ vst1.16 {q3-q4}, [r0]
+ sub r0, #32
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
+.align 4
+tr4f:
+.word 0x00240053 // 36 and d1[0] = 83
+.word 0x00000000
+tr8f:
+.word 0x0059004b // 89, d0[0] = 75
+.word 0x00320012 // 50, d0[2] = 18
+tr16:
+.word 0x005a0057 // 90, d2[0] = 87
+.word 0x00500046 // 80, d2[2] = 70
+.word 0x0039002b // 57, d2[0] = 43
+.word 0x00190009 // 25, d2[2] = 9
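
The tr4 / tr4_shift macros in the file above implement the standard 4-point HEVC inverse-transform butterfly; the constants 64, 83 and 36 come from the tr4f table (0x00240053 packs 36 and 83). As a reading aid, here is a scalar sketch of what one pass computes, with shift 7 used for the first pass and 12 for the second, as in ff_hevc_transform_4x4_neon_8. It drops the int16 saturation that vqrshrn performs, so it is a model rather than a bit-exact reimplementation.

    /* Scalar model of the tr4_shift macro: 4-point even/odd butterfly with
     * a rounding right shift. src/dst hold one 4-sample column. */
    #include <stdint.h>

    static int16_t rshift_round(int v, int shift)
    {
        return (int16_t)((v + (1 << (shift - 1))) >> shift); /* vqrshrn, minus the saturation */
    }

    static void tr4_shift_c(int16_t dst[4], const int16_t src[4], int shift)
    {
        int e0 = 64 * (src[0] + src[2]);     /* even part */
        int e1 = 64 * (src[0] - src[2]);
        int o0 = 83 * src[1] + 36 * src[3];  /* odd part  */
        int o1 = 36 * src[1] - 83 * src[3];

        dst[0] = rshift_round(e0 + o0, shift);
        dst[1] = rshift_round(e1 + o1, shift);
        dst[2] = rshift_round(e1 - o1, shift);
        dst[3] = rshift_round(e0 - o0, shift);
    }

The 8x8 path builds on the same idea: tr4 produces the even half from the even-indexed inputs, tr8_begin/tr8_end add the odd half using the 89/75/50/18 constants from tr8f.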
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
new file mode 100644
index 0000000000..adcc454511
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+av_cold void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevcdsp_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
new file mode 100644
index 0000000000..1a3912c609
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+#define PUT_PIXELS(name) \
+ void name(int16_t *dst, uint8_t *src, \
+ ptrdiff_t srcstride, int height, \
+ intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+#define QPEL_FUNC(name) \
+ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+ int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW_PIX(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX
+
+#define QPEL_FUNC_UW(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
+ int x;
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
+ c->idct[0] = ff_hevc_transform_4x4_neon_8;
+ c->idct[1] = ff_hevc_transform_8x8_neon_8;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
+ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_8;
+ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8;
+ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8;
+ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8;
+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
+ put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
+ put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
+ put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
+ put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
+ put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
+ put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
+ put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
+ put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
+ put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
+ put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
+ put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
+ put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
+ put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
+ put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
+ put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
+ put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
+ put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
+ put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
+ put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
+ put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
+ put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
+ put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
+ put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
+ put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
+ put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
+ put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
+ put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
+ put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
+ put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
+ for (x = 0; x < 10; x++) {
+ c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ }
+ c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
+ c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+ }
+}
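Note: the wrappers above dispatch purely on the fractional motion-vector components — mx and my (each 0..3) index the file-local put_hevc_qpel_neon / put_hevc_qpel_uw_neon tables, and the block width is passed through as a runtime argument, which is why the same wrapper can be reused for every block-size slot of c->put_hevc_qpel[x][..]. A minimal stand-alone sketch of that indexing idea follows; the typedef, table layout and function name are simplified illustrations, not the prototypes used by the patch.

/* Hypothetical sketch of the [my][mx] dispatch used by the wrappers above;
 * qpel_fn and call_qpel are illustrative names, not FFmpeg API. */
#include <stddef.h>
#include <stdint.h>

typedef void (*qpel_fn)(int16_t *dst, ptrdiff_t dststride,
                        const uint8_t *src, ptrdiff_t srcstride,
                        int height, int width);

void call_qpel(qpel_fn table[4][4], int16_t *dst,
               const uint8_t *src, ptrdiff_t srcstride,
               int height, intptr_t mx, intptr_t my, int width)
{
    /* mx/my select the horizontal/vertical quarter-pel filter; [0][0]
     * (integer pel) is handled separately by the put_pixels_wN functions.
     * The destination stride is fixed to MAX_PB_SIZE (64) int16_t elements. */
    table[my][mx](dst, 64, src, srcstride, height, width);
}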
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..86f92cf75a
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ vmov d19, d20
+ vmov d20, d21
+ vmov d21, d22
+ vmov d22, d23
+.endm
+
+.macro regshuffle_q8
+ vmov q0, q1
+ vmov q1, q2
+ vmov q2, q3
+ vmov q3, q4
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+.endm
+
+.macro vextin8
+ pld [r2]
+ vld1.8 {q11}, [r2], r3
+ vext.8 d16, d22, d23, #1
+ vext.8 d17, d22, d23, #2
+ vext.8 d18, d22, d23, #3
+ vext.8 d19, d22, d23, #4
+ vext.8 d20, d22, d23, #5
+ vext.8 d21, d22, d23, #6
+ vext.8 d22, d22, d23, #7
+.endm
+
+.macro loadin8
+ pld [r2]
+ vld1.8 {d16}, [r2], r3
+ pld [r2]
+ vld1.8 {d17}, [r2], r3
+ pld [r2]
+ vld1.8 {d18}, [r2], r3
+ pld [r2]
+ vld1.8 {d19}, [r2], r3
+ pld [r2]
+ vld1.8 {d20}, [r2], r3
+ pld [r2]
+ vld1.8 {d21}, [r2], r3
+ pld [r2]
+ vld1.8 {d22}, [r2], r3
+ pld [r2]
+ vld1.8 {d23}, [r2], r3
+.endm
+
+.macro qpel_filter_1_32b
+ vmov.i16 d16, #58
+ vmov.i16 d17, #10
+ vmull.s16 q9, d6, d16 // 58 * d0
+ vmull.s16 q10, d7, d16 // 58 * d1
+ vmov.i16 d16, #17
+ vmull.s16 q11, d4, d17 // 10 * c0
+ vmull.s16 q12, d5, d17 // 10 * c1
+ vmov.i16 d17, #5
+ vmull.s16 q13, d8, d16 // 17 * e0
+ vmull.s16 q14, d9, d16 // 17 * e1
+ vmull.s16 q15, d10, d17 // 5 * f0
+ vmull.s16 q8, d11, d17 // 5 * f1
+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0
+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1
+ vshll.s16 q11, d2, #2 // 4 * b0
+ vshll.s16 q12, d3, #2 // 4 * b1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
+ vsubl.s16 q13, d12, d0 // g0 - a0
+ vsubl.s16 q14, d13, d1 // g1 - a1
+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0
+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+// input q0 - q7
+// output q8
+.macro qpel_filter_2_32b
+ vmov.i32 q8, #11
+ vaddl.s16 q9, d6, d8 // d0 + e0
+ vaddl.s16 q10, d7, d9 // d1 + e1
+ vaddl.s16 q11, d4, d10 // c0 + f0
+ vaddl.s16 q12, d5, d11 // c1 + f1
+ vmul.s32 q11, q8 // 11 * (c0 + f0)
+ vmul.s32 q12, q8 // 11 * (c1 + f1)
+ vmov.i32 q8, #40
+ vaddl.s16 q15, d2, d12 // b0 + g0
+ vmul.s32 q9, q8 // 40 * (d0 + e0)
+ vmul.s32 q10, q8 // 40 * (d1 + e1)
+ vaddl.s16 q8, d3, d13 // b1 + g1
+ vaddl.s16 q13, d0, d14 // a0 + h0
+ vaddl.s16 q14, d1, d15 // a1 + h1
+ vshl.s32 q15, #2 // 4*(b0+g0)
+ vshl.s32 q8, #2 // 4*(b1+g1)
+ vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0
+ vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1
+ vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0)
+ vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1)
+ vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+ vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+.macro qpel_filter_3_32b
+ vmov.i16 d16, #58
+ vmov.i16 d17, #10
+ vmull.s16 q9, d8, d16 // 58 * d0
+ vmull.s16 q10, d9, d16 // 58 * d1
+ vmov.i16 d16, #17
+ vmull.s16 q11, d10, d17 // 10 * c0
+ vmull.s16 q12, d11, d17 // 10 * c1
+ vmov.i16 d17, #5
+ vmull.s16 q13, d6, d16 // 17 * e0
+ vmull.s16 q14, d7, d16 // 17 * e1
+ vmull.s16 q15, d4, d17 // 5 * f0
+ vmull.s16 q8, d5, d17 // 5 * f1
+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0
+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1
+ vshll.s16 q11, d12, #2 // 4 * b0
+ vshll.s16 q12, d13, #2 // 4 * b1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
+ vsubl.s16 q13, d2, d14 // g0 - a0
+ vsubl.s16 q14, d3, d15 // g1 - a1
+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0
+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+.macro qpel_filter_1 out=q7
+ vmov.u8 d24, #58
+ vmov.u8 d25, #10
+ vshll.u8 q13, d20, #4 // 16*e
+ vshll.u8 q14, d21, #2 // 4*f
+ vmull.u8 \out, d19, d24 // 58*d
+ vaddw.u8 q13, q13, d20 // 17*e
+ vmull.u8 q15, d18, d25 // 10*c
+ vaddw.u8 q14, q14, d21 // 5*f
+ vsubl.u8 q12, d22, d16 // g - a
+ vadd.u16 \out, q13 // 58d + 17e
+ vshll.u8 q13, d17, #2 // 4*b
+ vadd.u16 q15, q14 // 10*c + 5*f
+ vadd.s16 q13, q12 // - a + 4*b + g
+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro qpel_filter_2 out=q7
+ vmov.i16 q12, #10
+ vmov.i16 q14, #11
+ vaddl.u8 q13, d19, d20 // d + e
+ vaddl.u8 q15, d18, d21 // c + f
+ vmul.u16 q13, q12 // 10 * (d+e)
+ vmul.u16 q15, q14 // 11 * ( c + f)
+ vaddl.u8 \out, d17, d22 // b + g
+ vaddl.u8 q12, d16, d23 // a + h
+ vadd.u16 \out, q13 // b + 10 * (d + e) + g
+ vadd.s16 q12, q15
+ vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g)
+ vsub.s16 \out, q12
+.endm
+
+.macro qpel_filter_3 out=q7
+ vmov.u8 d24, #58
+ vmov.u8 d25, #10
+ vshll.u8 q13, d19, #4 // 16*e
+ vshll.u8 q14, d18, #2 // 4*f
+ vmull.u8 \out, d20, d24 // 58*d
+ vaddw.u8 q13, q13, d19 // 17*e
+ vmull.u8 q15, d21, d25 // 10*c
+ vaddw.u8 q14, q14, d18 // 5*f
+ vsubl.u8 q12, d17, d23 // g - a
+ vadd.u16 \out, q13 // 58d + 17e
+ vshll.u8 q13, d22, #2 // 4*b
+ vadd.u16 q15, q14 // 10*c + 5*f
+ vadd.s16 q13, q12 // - a + 4*b + g
+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
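+
+// Summary of the luma tap weights implemented by the filter macros above
+// (samples a..h = positions -3..+4 in the filtering direction; the filter
+// number is the mx/my fraction selected in ff_hevcdsp_init_neon()):
+//   qpel_filter_1*: -1   4  -10  58  17   -5   1   0
+//   qpel_filter_2*: -1   4  -11  40  40  -11   4  -1
+//   qpel_filter_3*:  0   1   -5  17  58  -10   4  -1
+// The plain macros take 8-bit pixels and produce 16-bit sums; the _32b
+// variants take 16-bit intermediates (combined h+v passes) and narrow
+// their 32-bit accumulators back to 16 bits with a saturating >> 6.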
+
+.macro hevc_put_qpel_vX_neon_8 filter
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+ vpush {d8-d15}
+ sub r2, r2, r3, lsl #1
+ sub r2, r3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ lsl r1, #1
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vst1.16 {q7}, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vst1.16 d14, [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_vX_neon_8 filter
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, r2, r3, lsl #1
+ sub r2, r3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.8 d0, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.32 d0[0], [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+ b 99f
+.Lbi\@: lsl r9, #1
+ mov r10, r8
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q7
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r8, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d14
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
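+
+// In hevc_put_qpel_uw_vX_neon_8 the src2 argument selects the output mode:
+// if src2 is NULL, the filtered 16-bit values are rounded back to 8 bits
+// with vqrshrun #6; otherwise (bi-prediction) the 16-bit src2 samples are
+// added with saturation first and the sum is narrowed with vqrshrun #7.
+// The uw_hX and uw_hXvY macros below follow the same pattern.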
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+
+ vpush {d8-d15}
+ sub r2, #4
+ lsl r1, #1
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vextin8
+ \filter
+ vst1.16 {q7}, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vst1.16 d14, [r0], r1
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, #4
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vextin8
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.8 d0, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.32 d0[0], [r0], r1
+ bne 4b
+ b 99f
+.Lbi\@:
+ lsl r9, #1
+ cmp r5, #4
+ beq 4f
+ mov r10, r8
+8: subs r4, #1
+ vextin8
+ \filter
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q7
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ add r10, #16
+ mov r8, r10
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d14
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+
+ vpush {d8-d15}
+ sub r2, #4
+ sub r2, r2, r3, lsl #1
+ sub r2, r3 // extra_before 3
+ lsl r1, #1
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vst1.16 {q8}, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vst1.16 d16, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, #4
+ sub r2, r2, r3, lsl #1
+ sub r2, r3 // extra_before 3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vqrshrun.s16 d0, q8, #6
+ vst1.8 d0, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vqrshrun.s16 d0, q8, #6
+ vst1.32 d0[0], [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+ b 99f
+.Lbi\@: lsl r9, #1
+ mov r10, r8
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q8
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r8, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d16
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
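+// Integer-pel copy path (mx = my = 0): each 8-bit pixel is widened to the
+// 16-bit intermediate format by a left shift of 6 (vshll.u8 #6) and rows
+// are stored with a fixed stride of MAX_PB_SIZE 16-bit elements
+// (r12 = MAX_PB_SIZE * 2 bytes); the w2 and w6 variants use a vbit mask
+// so that destination bytes beyond the block width are left untouched.
+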
+.macro init_put_pixels
+ pld [r1]
+ pld [r1, r2]
+ mov r12, MAX_PB_SIZE
+ lsl r12, #1
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+ init_put_pixels
+ vmov.u8 d5, #255
+ vshr.u64 d5, #32
+0: subs r3, #1
+ vld1.32 {d0[0]}, [r1], r2
+ pld [r1]
+ vld1.32 d6, [r0]
+ vshll.u8 q0, d0, #6
+ vbit d6, d0, d5
+ vst1.32 d6, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.32 {d0[0]}, [r1], r2
+ vld1.32 {d0[1]}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q0, d0, #6
+ vst1.64 {d0}, [r0], r12
+ vst1.64 {d1}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+ init_put_pixels
+ vmov.u8 q10, #255
+ vshr.u64 d21, #32
+0: subs r3, #1
+ vld1.16 {d0}, [r1], r2
+ pld [r1]
+ vshll.u8 q0, d0, #6
+ vld1.8 {q12}, [r0]
+ vbit q12, q0, q10
+ vst1.8 {q12}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q0, d0, #6
+ vshll.u8 q1, d2, #6
+ vst1.16 {q0}, [r0], r12
+ vst1.16 {q1}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.64 {d0}, [r1]
+ add r1, #8
+ vld1.32 {d1[0]}, [r1], r2
+ sub r1, #8
+ vld1.64 {d2}, [r1]
+ add r1, #8
+ vld1.32 {d1[1]}, [r1], r2
+ sub r1, #8
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vmov d22, d19
+ vst1.64 {d16, d17, d18}, [r0], r12
+ vst1.64 {d20, d21, d22}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.8 {q0}, [r1], r2
+ vld1.8 {q1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vst1.8 {q8, q9}, [r0], r12
+ vst1.8 {q10, q11}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {d0, d1, d2}, [r1], r2
+ pld [r1]
+ vshll.u8 q10, d0, #6
+ vshll.u8 q11, d1, #6
+ vshll.u8 q12, d2, #6
+ vstm r0, {q10, q11, q12}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1], r2
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vstm r0, {q8, q9, q10, q11}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1]
+ add r1, #32
+ vld1.8 {q2}, [r1], r2
+ sub r1, #32
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vshll.u8 q12, d4, #6
+ vshll.u8 q13, d5, #6
+ vstm r0, {q8, q9, q10, q11, q12, q13}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1]
+ add r1, #32
+ vld1.8 {q2, q3}, [r1], r2
+ sub r1, #32
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vshll.u8 q12, d4, #6
+ vshll.u8 q13, d5, #6
+ vshll.u8 q14, d6, #6
+ vshll.u8 q15, d7, #6
+ vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
+ push {r4-r9}
+ ldr r5, [sp, #24] // width
+ ldr r4, [sp, #28] // height
+ ldr r8, [sp, #32] // src2
+ ldr r9, [sp, #36] // src2stride
+ vpush {d8-d15}
+ cmp r8, #0
+ bne 2f
+1: subs r4, #1
+ vld1.8 {d0}, [r2], r3
+ vst1.8 d0, [r0], r1
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+2: subs r4, #1
+ vld1.8 {d0}, [r2], r3
+ vld1.16 {q1}, [r8], r9
+ vshll.u8 q0, d0, #6
+ vqadd.s16 q0, q1
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ bne 2b
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+ ldr r12, [sp] // height
+1: subs r12, #4
+ vld1.32 {\regs} , [r2], r3
+ vld1.32 {\regs2} , [r2], r3
+ vld1.32 {\regs3} , [r2], r3
+ vld1.32 {\regs4} , [r2], r3
+ vst1.32 {\regs} , [r0], r1
+ vst1.32 {\regs2} , [r0], r1
+ vst1.32 {\regs3} , [r0], r1
+ vst1.32 {\regs4} , [r0], r1
+ bne 1b
+ bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+ push {r4-r5}
+ ldr r12, [sp, #8] // height
+1: subs r12, #2
+ mov r4, r2
+ vld1.32 {\regs} , [r2]!
+ vld1.32 {\regs2} , [r2]
+ add r2, r4, r3
+ mov r4, r2
+ vld1.32 {\regs3} , [r2]!
+ vld1.32 {\regs4} , [r2]
+ add r2, r4, r3
+ mov r5, r0
+ vst1.32 {\regs} , [r0]!
+ vst1.32 {\regs2} , [r0]
+ add r0, r5, r1
+ mov r5, r0
+ vst1.32 {\regs3} , [r0]!
+ vst1.32 {\regs4} , [r0]
+ add r0, r5, r1
+ bne 1b
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1]
+put_qpel_uw_pixels 8, d0, d1, d2, d3
+put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0]
+put_qpel_uw_pixels 16, q0, q1, q2, q3
+put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21
+put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11
+put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
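For reference, the arithmetic that the NEON macros in this file vectorize can be written as a short scalar routine. The sketch below is only an illustration derived from the tap weights documented in the macro comments above; qpel_taps and qpel_h_sample are hypothetical names and not part of the patch.

#include <stdint.h>

/* Luma 8-tap qpel weights as implemented by qpel_filter_1/2/3; the taps
 * apply to src[-3] .. src[+4] around the output position. */
static const int8_t qpel_taps[3][8] = {
    { -1, 4, -10, 58, 17,  -5,  1,  0 },   /* fraction 1 */
    { -1, 4, -11, 40, 40, -11,  4, -1 },   /* fraction 2 */
    {  0, 1,  -5, 17, 58, -10,  4, -1 },   /* fraction 3 */
};

/* One output sample of the 8-bit first filter pass.  The NEON code keeps
 * this 16-bit value as-is for the "put" path and rounds it back to 8 bits
 * (vqrshrun #6, i.e. (sum + 32) >> 6 with clipping) for the "uw" path. */
static int16_t qpel_h_sample(const uint8_t *src, int frac)
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += qpel_taps[frac - 1][i] * src[i - 3];
    return (int16_t)sum;
}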
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
index 0f8092e15e..219f793d99 100644
--- a/libavcodec/arm/hpeldsp_arm.S
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -2,20 +2,20 @@
@ ARMv4-optimized halfpel functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
index a8641529d5..5f3c7741c1 100644
--- a/libavcodec/arm/hpeldsp_arm.h
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
index f1abc328eb..a8bd459c20 100644
--- a/libavcodec/arm/hpeldsp_armv6.S
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
index 63906606a2..1977b1379b 100644
--- a/libavcodec/arm/hpeldsp_init_arm.c
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM-optimized halfpel functions
* Copyright (c) 2001 Lionel Ulmer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
index 67a500d513..967a8e0427 100644
--- a/libavcodec/arm/hpeldsp_init_armv6.c
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
index 76d4eafceb..d9feadd1dd 100644
--- a/libavcodec/arm/hpeldsp_init_neon.c
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
index 90bc3cb8ae..cf4a6cfb8d 100644
--- a/libavcodec/arm/hpeldsp_neon.S
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index 168d64b666..39cef3a874 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
index 34f467e86f..057eff9be8 100644
--- a/libavcodec/arm/idctdsp_arm.S
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -2,27 +2,27 @@
@ ARMv4-optimized IDCT functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
function ff_add_pixels_clamped_arm, export=1, align=5
push {r4-r10}
mov r10, #8
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
index 9012b82904..d7bc5cd02a 100644
--- a/libavcodec/arm/idctdsp_arm.h
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
index c180d732fa..a6e77d6da1 100644
--- a/libavcodec/arm/idctdsp_armv6.S
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 8207c31589..0068e3f86c 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM-optimized IDCT functions
* Copyright (c) 2001 Lionel Ulmer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,7 +30,7 @@
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
- int line_size);
+ ptrdiff_t line_size);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
* converted */
@@ -63,8 +63,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
{
int cpu_flags = av_get_cpu_flags();
- if (!high_bit_depth) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
+ if (!avctx->lowres && !high_bit_depth) {
+ if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_ARM) {
c->idct_put = j_rev_dct_arm_put;
c->idct_add = j_rev_dct_arm_add;
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
index 251165dd74..3d881e1f18 100644
--- a/libavcodec/arm/idctdsp_init_armv5te.c
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,8 +29,9 @@
av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth &&
+ if (!avctx->lowres && !high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
c->idct_put = ff_simple_idct_put_armv5te;
c->idct_add = ff_simple_idct_add_armv5te;
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
index 8f0c49b142..edf3070e15 100644
--- a/libavcodec/arm/idctdsp_init_armv6.c
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,13 +27,13 @@
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
- int line_size);
+ ptrdiff_t line_size);
av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
+ if (!avctx->lowres && !high_bit_depth) {
+ if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
c->idct_put = ff_simple_idct_put_armv6;
c->idct_add = ff_simple_idct_add_armv6;
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
index c94f7b6e5d..b70c5b0d44 100644
--- a/libavcodec/arm/idctdsp_init_neon.c
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,15 +27,16 @@
#include "idct.h"
#include "idctdsp_arm.h"
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth) {
+ if (!avctx->lowres && !high_bit_depth) {
if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
c->idct_put = ff_simple_idct_put_neon;
c->idct_add = ff_simple_idct_add_neon;
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
index 7095879bae..1911a33468 100644
--- a/libavcodec/arm/idctdsp_neon.S
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -2,20 +2,20 @@
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 42f37392e1..72c4c77c45 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -1,21 +1,21 @@
/*
* ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
+ * Copyright (c) 2009 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1
vmlal.s16 q2, d18, d22
vmlal.s16 q3, d19, d23
subs r2, r2, #16
- bne 1b
+ bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
vmov.32 r0, d3[0]
bx lr
endfunc
+
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c
index 47ea034359..981a39aff9 100644
--- a/libavcodec/arm/apedsp_init_arm.c
+++ b/libavcodec/arm/lossless_audiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,12 +23,12 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
-av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S
index 7cfbf43c6d..ba7c45fcef 100644
--- a/libavcodec/arm/apedsp_neon.S
+++ b/libavcodec/arm/lossless_audiodsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised integer operations
* Copyright (c) 2009 Kostya Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
- bne 1b
+ bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 45ac67d436..dc57c5571c 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -2,20 +2,20 @@
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c
deleted file mode 100644
index 606c80cbf4..0000000000
--- a/libavcodec/arm/mdct_fixed_init_arm.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#define FFT_FLOAT 0
-#include "libavcodec/fft.h"
-
-void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
-void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
-
-av_cold void ff_mdct_fixed_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- if (!s->inverse && s->nbits >= 3) {
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- s->mdct_calc = ff_mdct_fixed_calc_neon;
- s->mdct_calcw = ff_mdct_fixed_calcw_neon;
- }
- }
-}
diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S
index c77be59c65..365c5e7faf 100644
--- a/libavcodec/arm/mdct_fixed_neon.S
+++ b/libavcodec/arm/mdct_fixed_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c
deleted file mode 100644
index 24678dd8d0..0000000000
--- a/libavcodec/arm/mdct_init_arm.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags)) {
- s->imdct_half = ff_imdct_half_vfp;
- }
-
- if (have_neon(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_neon;
- s->imdct_half = ff_imdct_half_neon;
- s->mdct_calc = ff_mdct_calc_neon;
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- }
-}
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index bfe259c396..a6952fa571 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index f3fe668eae..43f6d14c0c 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
index 436e20dd25..fa5a82301e 100644
--- a/libavcodec/arm/me_cmp_armv6.S
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
index 4d73f3e0fd..03870a2bfa 100644
--- a/libavcodec/arm/me_cmp_init_arm.c
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S
index 4272dae029..4f9aa485fd 100644
--- a/libavcodec/arm/mlpdsp_armv5te.S
+++ b/libavcodec/arm/mlpdsp_armv5te.S
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
index de9db466a5..b7ecf6cfae 100644
--- a/libavcodec/arm/mlpdsp_armv6.S
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 4cdd10caf5..34a5f61e1d 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 49bd0bcaf2..977abb6939 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
index e73aee6a2b..98e0c8a3a8 100644
--- a/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 34e9cf18b5..918be16d03 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2002 Michael Niedermayer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h
index 17e3a5b024..709ae6b247 100644
--- a/libavcodec/arm/mpegvideo_arm.h
+++ b/libavcodec/arm/mpegvideo_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 4bb7b6e025..e20bb4c645 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -2,24 +2,25 @@
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
@@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
int level, qmul, qadd;
int nCoeffs;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
qmul = qscale << 1;
@@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
int qmul, qadd;
int nCoeffs;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
qadd = (qscale - 1) | 1;
qmul = qscale << 1;
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 4426e15e91..8687d6b31c 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -2,20 +2,20 @@
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 3e1f7b53e2..1889d7a912 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
index 99db501b25..ab0dad7b18 100644
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c
index ab9ba3e1be..4bfe835684 100644
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S
index 716a607af7..787bc4bf36 100644
--- a/libavcodec/arm/neon.S
+++ b/libavcodec/arm/neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c
index 692576ee45..56f950abe0 100644
--- a/libavcodec/arm/neontest.c
+++ b/libavcodec/arm/neontest.c
@@ -2,20 +2,20 @@
* check NEON registers for clobbers
* Copyright (c) 2013 Martin Storsjo
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -77,3 +77,23 @@ wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
{
testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}
+
+wrap(avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt))
+{
+ testneonclobbers(avcodec_send_packet, avctx, avpkt);
+}
+
+wrap(avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame))
+{
+ testneonclobbers(avcodec_receive_frame, avctx, frame);
+}
+
+wrap(avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame))
+{
+ testneonclobbers(avcodec_send_frame, avctx, frame);
+}
+
+wrap(avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt))
+{
+ testneonclobbers(avcodec_receive_packet, avctx, avpkt);
+}
diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S
index 4c925a4daa..b10ea78e88 100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c
index bb32631df4..59d2b49381 100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c
index 2858ba93e8..1c5d8beb61 100644
--- a/libavcodec/arm/rdft_init_arm.c
+++ b/libavcodec/arm/rdft_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 7d01d53f1a..781d976354 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised RDFT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c
index 5ce787ba7f..8bfe90b3d3 100644
--- a/libavcodec/arm/rv34dsp_init_arm.c
+++ b/libavcodec/arm/rv34dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a29123f772..3d4a83d9ac 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c
index df3e4611a1..c24854d1cd 100644
--- a/libavcodec/arm/rv40dsp_init_arm.c
+++ b/libavcodec/arm/rv40dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 6bd45eb5ad..099f88c092 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 4da7967b49..4fb69f922b 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
index 610397f9e2..e66abd682a 100644
--- a/libavcodec/arm/sbrdsp_neon.S
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index a651927515..42d79ab95e 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -4,22 +4,22 @@
* Author: Frederic Boulay <dilb@handhelds.org>
*
* The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the Libav project.
+ * from the libavcodec library part of the FFmpeg project.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index b19683320a..a8d03469ab 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -4,20 +4,20 @@
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index 60723467a0..79cf5d41fb 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -4,20 +4,20 @@
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index a1cde8d80a..c3e573c00a 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -6,20 +6,20 @@
* Based on Simple IDCT
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
index d7996c1a4b..cf25d9d4df 100644
--- a/libavcodec/arm/startcode.h
+++ b/libavcodec/arm/startcode.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
index 64078b2898..a46f009375 100644
--- a/libavcodec/arm/startcode_armv6.S
+++ b/libavcodec/arm/startcode_armv6.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c
index bf0d9b4b17..ea0ce148d4 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,20 +22,9 @@
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
-void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
-
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- SynthFilterContext *synth, FFTContext *imdct,
- float synth_buf_ptr[512],
- int *synth_buf_offset, float synth_buf2[32],
- const float window[512], float *samples_out,
- float raXin[32], float scale);
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
@@ -49,21 +38,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir32_vfp;
- s->lfe_fir[1] = ff_dca_lfe_fir64_vfp;
- s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
- }
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 62bb6674ed..5417be7d53 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 5d79e509f9..596734c5bc 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h
index 30f059f28c..cd01ac5384 100644
--- a/libavcodec/arm/vc1dsp.h
+++ b/libavcodec/arm/vc1dsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index a6a97c8bf9..5f2c759048 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_ARMV6
if (have_setend(cpu_flags))
dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
if (have_neon(cpu_flags))
ff_vc1dsp_init_neon(dsp);
}
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 9ded7a28b9..bb873e687e 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,40 +37,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd) \
+{ \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
int x, int y);
@@ -81,6 +79,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
int x, int y);
+#define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
{
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
@@ -92,23 +94,26 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
- dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
+ dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
if (HAVE_AS_DN_DIRECTIVE) {
- dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
- dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
- dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
- dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon;
- dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon;
- dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon;
- dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon;
- dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon;
- dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon;
- dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon;
- dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon;
- dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon;
- dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
- dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
- dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
+ FN_ASSIGN(1, 0);
+ FN_ASSIGN(2, 0);
+ FN_ASSIGN(3, 0);
+
+ FN_ASSIGN(0, 1);
+ FN_ASSIGN(1, 1);
+ FN_ASSIGN(2, 1);
+ FN_ASSIGN(3, 1);
+
+ FN_ASSIGN(0, 2);
+ FN_ASSIGN(1, 2);
+ FN_ASSIGN(2, 2);
+ FN_ASSIGN(3, 2);
+
+ FN_ASSIGN(0, 3);
+ FN_ASSIGN(1, 3);
+ FN_ASSIGN(2, 3);
+ FN_ASSIGN(3, 3);
}
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index fa87eded61..c4f4db9c8e 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -4,20 +4,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h
index a7087599cc..112cbb86c7 100644
--- a/libavcodec/arm/videodsp_arm.h
+++ b/libavcodec/arm/videodsp_arm.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S
index 0510019f03..aff1161ada 100644
--- a/libavcodec/arm/videodsp_armv5te.S
+++ b/libavcodec/arm/videodsp_armv5te.S
@@ -2,20 +2,20 @@
@ ARMv5te-optimized core video DSP functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c
index 20c6e4a605..a89abb25d5 100644
--- a/libavcodec/arm/videodsp_init_arm.c
+++ b/libavcodec/arm/videodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index 832191f6d2..1ea1f3438d 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
{
+#if HAVE_ARMV5TE_EXTERNAL
ctx->prefetch = ff_prefetch_arm;
+#endif
}
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c
index 853ba2d865..f4b3d80ef6 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S
index 7df876c2bc..79ce54f938 100644
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 1c914343d3..65ea53fe0f 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 58bd97d548..2942d488f5 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 David Conrad
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 6bc9456336..feb1247916 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c
index 7e2615047b..a59d61278c 100644
--- a/libavcodec/arm/vp6dsp_init_arm.c
+++ b/libavcodec/arm/vp6dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S
index 10b4d0f14c..03dd28d1cb 100644
--- a/libavcodec/arm/vp6dsp_neon.S
+++ b/libavcodec/arm/vp6dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
index 93b2788835..965342d93b 100644
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 3863dc31a5..e7d25a45c1 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h
index 0d55e0ffc0..7281d0bfb1 100644
--- a/libavcodec/arm/vp8dsp.h
+++ b/libavcodec/arm/vp8dsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index 9eb9734cd3..2320bf4d23 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -5,20 +5,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* This code was partially ported from libvpx, which uses this license:
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index aa77dbab98..8b801766d7 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index febe4e71a2..a5bcd733e0 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index 2b6c7750d3..53f1f23380 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index f43b4f7060..fcb424881b 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -4,20 +4,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp9dsp_init.h b/libavcodec/arm/vp9dsp_init.h
new file mode 100644
index 0000000000..0dc1c2dc20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP9DSP_INIT_H
+#define AVCODEC_ARM_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_ARM_VP9DSP_INIT_H */
diff --git a/libavcodec/arm/vp9dsp_init_10bpp_arm.c b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
new file mode 100644
index 0000000000..b8cb293b20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_12bpp_arm.c b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
new file mode 100644
index 0000000000..fa65eb260b
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
new file mode 100644
index 0000000000..3620535065
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix) \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp) \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp) \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
+ temp + 3 * 2 * sz, 2 * sz, \
+ h, 0, my); \
+}
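As an illustration (not part of the patch itself), define_8tap_2d_fn(put, regular, 8, BPP) with BPP expanding to 10 produces roughly the following helper, which composes the 2D filter from the horizontal and vertical NEON passes declared below:

    static void put_regular8_hv_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                        const uint8_t *src, ptrdiff_t src_stride,
                                        int h, int mx, int my)
    {
        /* 24 rows of 8 pixels, 2 bytes per pixel */
        LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (8 < 64)) * 8 + 8) * 8 * 2]);
        /* horizontal pass into the temp buffer, starting 3 rows above dst */
        ff_vp9_put_regular8_h_10_neon(temp, 2 * 8, src - 3 * src_stride, src_stride,
                                      h + 8, mx, 0);
        /* vertical pass from the temp buffer, skipping the 3 rows of top padding */
        ff_vp9_put_regular8_v_10_neon(dst, dst_stride, temp + 3 * 2 * 8, 2 * 8,
                                      h, 0, my);
    }

Compared with the 8 bpp init further down, the temporary buffer and its stride are doubled because the samples here are 16 bits wide.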
+
+#define decl_filter_funcs(op, dir, sz, bpp) \
+ decl_mc_func(op, regular, dir, sz, bpp); \
+ decl_mc_func(op, sharp, dir, sz, bpp); \
+ decl_mc_func(op, smooth, dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp) \
+ decl_filter_funcs(put, h, sz, bpp); \
+ decl_filter_funcs(avg, h, sz, bpp); \
+ decl_filter_funcs(put, v, sz, bpp); \
+ decl_filter_funcs(avg, v, sz, bpp); \
+ decl_filter_funcs(put, hv, sz, bpp); \
+ decl_filter_funcs(avg, hv, sz, bpp)
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64, );
+declare_fpel(copy, 32, );
+declare_fpel(copy, 16, );
+declare_fpel(copy, 8, );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8, _16);
+declare_fpel(avg, 4, _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp) \
+ define_8tap_2d_fn(put, regular, sz, bpp) \
+ define_8tap_2d_fn(put, sharp, sz, bpp) \
+ define_8tap_2d_fn(put, smooth, sz, bpp) \
+ define_8tap_2d_fn(avg, regular, sz, bpp) \
+ define_8tap_2d_fn(avg, sharp, sz, bpp) \
+ define_8tap_2d_fn(avg, smooth, sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8, BPP)
+define_8tap_2d_funcs(4, BPP)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon
+
+#define init_copy_avg(idx, sz1, sz2) \
+ init_fpel(idx, 0, sz2, copy, ); \
+ init_fpel(idx, 1, sz1, avg, _16)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
+
+ init_copy_avg(0, 64, 128);
+ init_copy_avg(1, 32, 64);
+ init_copy_avg(2, 16, 32);
+ init_copy_avg(3, 8, 16);
+ init_copy_avg(4, 4, 8);
+
+ init_mc_funcs_dirs(0, 64, BPP);
+ init_mc_funcs_dirs(1, 32, BPP);
+ init_mc_funcs_dirs(2, 16, BPP);
+ init_mc_funcs_dirs(3, 8, BPP);
+ init_mc_funcs_dirs(4, 4, BPP);
+ }
+}
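The two size arguments to init_copy_avg reflect the naming of the full-pel functions: the bit-depth-agnostic copy routines are presumably named by byte width (a 64-pixel row of 16-bit samples is 128 bytes), while the averaging routines operate on 16-bit samples and take pixel widths, hence the _16 suffix shared between 10 and 12 bpp. init_copy_avg(0, 64, 128), for instance, ends up doing roughly:

    /* idx 0 == 64x64 blocks; SMOOTH, SHARP and BILINEAR get the same pointers */
    dsp->mc[0][FILTER_8TAP_REGULAR][0][0][0] = ff_vp9_copy128_neon;  /* put: 64 px * 2 bytes */
    dsp->mc[0][FILTER_8TAP_REGULAR][1][0][0] = ff_vp9_avg64_16_neon; /* avg: 64 16-bit px    */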
+
+#define define_itxfm2(type_a, type_b, sz, bpp) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp) \
+ define_itxfm(idct, idct, sz, bpp); \
+ define_itxfm(iadst, idct, sz, bpp); \
+ define_itxfm(idct, iadst, sz, bpp); \
+ define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4, BPP);
+define_itxfm_funcs(8, BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4, BPP);
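The extra define_itxfm2/define_itxfm level (and init_itxfm2/init_idct2 below) exists because a macro argument that appears next to ## is pasted literally without being expanded first; with a single-level macro, calls like define_itxfm(idct, idct, 32, BPP) would produce names containing the literal token BPP instead of 10 or 12. A minimal standalone illustration of the idiom:

    #define BPP 10
    #define NAME1(bpp) func_##bpp   /* NAME1(BPP) -> func_BPP: BPP is pasted unexpanded */
    #define NAME2(bpp) NAME1(bpp)   /* NAME2(BPP) -> func_10:  expanded, then pasted    */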
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+ init_itxfm(TX_4X4, 4x4, BPP);
+ init_itxfm(TX_8X8, 8x8, BPP);
+ init_itxfm(TX_16X16, 16x16, BPP);
+ init_idct(TX_32X32, idct_idct_32x32, BPP);
+ init_idct(4, iwht_iwht_4x4, BPP);
+ }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+ define_loop_filter(h, wd, size, bpp); \
+ define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4, 8, BPP);
+define_loop_filters(8, 8, BPP);
+define_loop_filters(16, 8, BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+ dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+ init_lpf_func_8(idx, 0, h, wd, bpp); \
+ init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp) \
+ init_lpf_func_16(0, h, bpp); \
+ init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+ init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
+ init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp) \
+ init_lpf_funcs_8_wd(0, 4, bpp); \
+ init_lpf_funcs_8_wd(1, 8, bpp); \
+ init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp) \
+ init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+ init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+ init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+ init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+ init_lpf_funcs_8(BPP);
+ init_lpf_funcs_16(BPP);
+ init_lpf_funcs_mix2(BPP);
+ }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+ vp9dsp_mc_init_arm(dsp);
+ vp9dsp_loopfilter_init_arm(dsp);
+ vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
new file mode 100644
index 0000000000..4c57fd6ba0
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz) \
+void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define declare_copy_avg(sz) \
+ declare_fpel(copy, sz); \
+ declare_fpel(avg , sz)
+
+#define decl_mc_func(op, filter, dir, sz) \
+void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz) \
+static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
+ temp + 3 * sz, sz, \
+ h, 0, my); \
+}
+
+#define decl_filter_funcs(op, dir, sz) \
+ decl_mc_func(op, regular, dir, sz); \
+ decl_mc_func(op, sharp, dir, sz); \
+ decl_mc_func(op, smooth, dir, sz)
+
+#define decl_mc_funcs(sz) \
+ decl_filter_funcs(put, h, sz); \
+ decl_filter_funcs(avg, h, sz); \
+ decl_filter_funcs(put, v, sz); \
+ decl_filter_funcs(avg, v, sz); \
+ decl_filter_funcs(put, hv, sz); \
+ decl_filter_funcs(avg, hv, sz)
+
+declare_copy_avg(64);
+declare_copy_avg(32);
+declare_copy_avg(16);
+declare_copy_avg(8);
+declare_copy_avg(4);
+
+decl_mc_funcs(64);
+decl_mc_funcs(32);
+decl_mc_funcs(16);
+decl_mc_funcs(8);
+decl_mc_funcs(4);
+
+#define define_8tap_2d_funcs(sz) \
+ define_8tap_2d_fn(put, regular, sz) \
+ define_8tap_2d_fn(put, sharp, sz) \
+ define_8tap_2d_fn(put, smooth, sz) \
+ define_8tap_2d_fn(avg, regular, sz) \
+ define_8tap_2d_fn(avg, sharp, sz) \
+ define_8tap_2d_fn(avg, smooth, sz)
+
+define_8tap_2d_funcs(64)
+define_8tap_2d_funcs(32)
+define_8tap_2d_funcs(16)
+define_8tap_2d_funcs(8)
+define_8tap_2d_funcs(4)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_neon
+
+#define init_copy_avg(idx, sz) \
+ init_fpel(idx, 0, sz, copy); \
+ init_fpel(idx, 1, sz, avg)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
+
+#define init_mc_funcs_dirs(idx, sz) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
+ init_mc_funcs(idx, hv, 1, 1, sz,)
+
+ init_copy_avg(0, 64);
+ init_copy_avg(1, 32);
+ init_copy_avg(2, 16);
+ init_copy_avg(3, 8);
+ init_copy_avg(4, 4);
+
+ init_mc_funcs_dirs(0, 64);
+ init_mc_funcs_dirs(1, 32);
+ init_mc_funcs_dirs(2, 16);
+ init_mc_funcs_dirs(3, 8);
+ init_mc_funcs_dirs(4, 4);
+ }
+}
+
+#define define_itxfm(type_a, type_b, sz) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+
+#define define_itxfm_funcs(sz) \
+ define_itxfm(idct, idct, sz); \
+ define_itxfm(iadst, idct, sz); \
+ define_itxfm(idct, iadst, sz); \
+ define_itxfm(iadst, iadst, sz)
+
+define_itxfm_funcs(4);
+define_itxfm_funcs(8);
+define_itxfm_funcs(16);
+define_itxfm(idct, idct, 32);
+define_itxfm(iwht, iwht, 4);
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm(tx, sz) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
+
+#define init_idct(tx, nm) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
+
+ init_itxfm(TX_4X4, 4x4);
+ init_itxfm(TX_8X8, 8x8);
+ init_itxfm(TX_16X16, 16x16);
+ init_idct(TX_32X32, idct_idct_32x32);
+ init_idct(4, iwht_iwht_4x4);
+ }
+}
+
+#define define_loop_filter(dir, wd, size) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size) \
+ define_loop_filter(h, wd, size); \
+ define_loop_filter(v, wd, size)
+
+define_loop_filters(4, 8);
+define_loop_filters(8, 8);
+define_loop_filters(16, 8);
+define_loop_filters(16, 16);
+
+define_loop_filters(44, 16);
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) \
+static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_##wd1##_8_neon(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
+ ff_vp9_loop_filter_##dir##_##wd2##_8_neon(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
+}
+
+#define lf_mix_fns(wd1, wd2) \
+ lf_mix_fn(h, wd1, wd2, stride) \
+ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t))
+
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
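Each mix2 entry filters two adjacent 8-pixel edge segments that may use different filter widths; the E, I and H thresholds for the second segment are passed in the upper byte. Only the 44 combination has a dedicated NEON implementation here; the others are composed in C. For instance, lf_mix_fns(4, 8) generates, for the horizontal direction:

    static void loop_filter_h_48_16_neon(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H)
    {
        /* first half of the edge: width-4 filter, thresholds in the low bytes */
        ff_vp9_loop_filter_h_4_8_neon(dst, stride, E & 0xff, I & 0xff, H & 0xff);
        /* second half, 8 lines further on: width-8 filter, thresholds in the high bytes */
        ff_vp9_loop_filter_h_8_8_neon(dst + 8 * stride, stride, E >> 8, I >> 8, H >> 8);
    }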
+
+static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
+ dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
+ dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
+
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
+
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
+ dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon;
+ dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon;
+ dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon;
+ dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_neon;
+ dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_neon;
+ dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_neon;
+ }
+}
+
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp)
+{
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_arm(dsp);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_arm(dsp);
+ return;
+ } else if (bpp != 8)
+ return;
+
+ vp9dsp_mc_init_arm(dsp);
+ vp9dsp_loopfilter_init_arm(dsp);
+ vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000000..b4f615ebb8
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+@ Do two 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vswp \r1, \r4 @ vtrn.64 \rq0, \rq2
+ vswp \r3, \r6 @ vtrn.64 \rq1, \rq3
+ vswp \r9, \r12 @ vtrn.64 \rq4, \rq6
+ vswp \r11, \r14 @ vtrn.64 \rq5, \rq7
+ vtrn.32 \rq0, \rq1
+ vtrn.32 \rq2, \rq3
+ vtrn.32 \rq4, \rq5
+ vtrn.32 \rq6, \rq7
+.endm
+
+@ Do eight 2x2 transposes.
+.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vtrn.32 \r0, \r1
+ vtrn.32 \r2, \r3
+ vtrn.32 \r4, \r5
+ vtrn.32 \r6, \r7
+ vtrn.32 \r8, \r9
+ vtrn.32 \r10, \r11
+ vtrn.32 \r12, \r13
+ vtrn.32 \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+ vadd.s32 \tmpd1, \in1, \in2
+ vsub.s32 \tmpd2, \in1, \in2
+.if \neg > 0
+ vneg.s32 \tmpd1, \tmpd1
+.endif
+ vmull.s32 \tmpq3, \tmpd1, d0[0]
+ vmull.s32 \tmpq4, \tmpd2, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+.endm
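In C terms the macro computes the following; in the idct paths d0[0] holds 11585, i.e. cos(pi/4) in Q14, loaded from the coefficient tables above, and the widening multiply plus vrshrn.s64 keep a 64-bit intermediate so the 32-bit inputs cannot overflow during the multiply:

    /* minimal C sketch of mbutterfly0 (illustrative only, neg == 0 case) */
    static inline void mbutterfly0_c(int32_t *out1, int32_t *out2,
                                     int32_t in1, int32_t in2, int32_t c)
    {
        *out1 = (int32_t)(((int64_t)(in1 + in2) * c + (1 << 13)) >> 14);
        *out2 = (int32_t)(((int64_t)(in1 - in2) * c + (1 << 13)) >> 14);
    }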
+
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s32 \tmpq3, \in1, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq3, #14
+.endm
+
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can do with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+ vadd.s32 \tmpq1, \in1, \in2
+ vsub.s32 \tmpq2, \in1, \in2
+ vmull.s32 \tmpq3, \tmpd11, d0[0]
+ vmull.s32 \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+ vmull.s32 \tmpq3, \tmpd21, d0[0]
+ vmull.s32 \tmpq4, \tmpd22, d0[0]
+ vrshrn.s64 \out3, \tmpq3, #14
+ vrshrn.s64 \out4, \tmpq4, #14
+.else
+ vmull.s32 \tmpq5, \tmpd21, d0[0]
+ vmull.s32 \tmpq6, \tmpd22, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+ vrshrn.s64 \out3, \tmpq5, #14
+ vrshrn.s64 \out4, \tmpq6, #14
+.endif
+.endm
+
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
+ vmull.s32 \out1, \in1, \coef1
+ vmlsl.s32 \out1, \in2, \coef2
+.if \neg
+ vmov.s64 \out2, #0
+ vmlsl.s32 \out2, \in1, \coef2
+ vmlsl.s32 \out2, \in2, \coef1
+.else
+ vmull.s32 \out2, \in1, \coef2
+ vmlal.s32 \out2, \in2, \coef1
+.endif
+.endm
+
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+ vmull.s32 \out1, \in1, \coef1
+ vmull.s32 \out2, \in2, \coef1
+ vmull.s32 \out3, \in1, \coef2
+ vmull.s32 \out4, \in2, \coef2
+ vmlsl.s32 \out1, \in3, \coef2
+ vmlsl.s32 \out2, \in4, \coef2
+ vmlal.s32 \out3, \in3, \coef1
+ vmlal.s32 \out4, \in4, \coef1
+.endm
+
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+ mbutterfly_l \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s32 \tmp1, \inout1, \coef1
+ vmull.s32 \tmp2, \inout1, \coef2
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmov.s64 \tmp1, #0
+ vmull.s32 \tmp2, \inout2, \coef1
+ vmlsl.s32 \tmp1, \inout2, \coef2
+ vrshrn.s64 \inout2, \tmp2, #14
+ vrshrn.s64 \inout1, \tmp1, #14
+.endm
+
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+ vrshrn.s64 \inout3, \tmp3, #14
+ vrshrn.s64 \inout4, \tmp4, #14
+.endm
+
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+ vadd.s32 \out1, \in1, \in2
+ vsub.s32 \out2, \in1, \in2
+.endm
+
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+ vsub.s32 \out1, \in1, \in2
+ vadd.s32 \out2, \in1, \in2
+.endm
+
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+ vadd.s64 \tmp1, \in1, \in2
+ vsub.s64 \tmp2, \in1, \in2
+ vrshrn.s64 \out1, \tmp1, #14
+ vrshrn.s64 \out2, \tmp2, #14
+.endm
+
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ vadd.s64 \tmp1, \in1, \in3
+ vadd.s64 \tmp2, \in2, \in4
+ vsub.s64 \tmp3, \in1, \in3
+ vsub.s64 \tmp4, \in2, \in4
+ vrshrn.s64 \out1, \tmp1, #14
+ vrshrn.s64 \out2, \tmp2, #14
+ vrshrn.s64 \out3, \tmp3, #14
+ vrshrn.s64 \out4, \tmp4, #14
+.endm
+
+
+.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vadd.i32 \c0, \c0, \c1
+ vsub.i32 q11, \c2, \c3
+ vsub.i32 q10, \c0, q11
+ vshr.s32 q10, q10, #1
+ vsub.i32 \c2, q10, \c1
+ vsub.i32 \c1, q10, \c3
+ vadd.i32 \c3, q11, \c2
+ vsub.i32 \c0, \c0, \c1
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ iwht4_10 \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
+.endm
+
+@ c0 == cd0,cd1, c1 == cd2,cd3
+.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmul.s32 q13, \c1, d1[1]
+ vmul.s32 q11, \c1, d1[0]
+ vadd.i32 q14, \c0, \c2
+ vsub.i32 q15, \c0, \c2
+ vmla.s32 q13, \c3, d1[0]
+ vmul.s32 q12, q14, d0[0]
+ vmul.s32 q10, q15, d0[0]
+ vmls.s32 q11, \c3, d1[1]
+ vrshr.s32 q13, q13, #14
+ vrshr.s32 q12, q12, #14
+ vrshr.s32 q10, q10, #14
+ vrshr.s32 q11, q11, #14
+ vadd.i32 \c0, q12, q13
+ vsub.i32 \c3, q12, q13
+ vadd.i32 \c1, q10, q11
+ vsub.i32 \c2, q10, q11
+.endm
+
+.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmull.s32 q13, \cd2, d1[1]
+ vmull.s32 q15, \cd3, d1[1]
+ vmull.s32 q11, \cd2, d1[0]
+ vmull.s32 q3, \cd3, d1[0]
+ vadd.i32 q14, \c0, \c2
+ vsub.i32 q2, \c0, \c2
+ vmlal.s32 q13, \cd6, d1[0]
+ vmlal.s32 q15, \cd7, d1[0]
+ vmull.s32 q12, d28, d0[0]
+ vmull.s32 q14, d29, d0[0]
+ vmull.s32 q10, d4, d0[0]
+ vmull.s32 q8, d5, d0[0]
+ vmlsl.s32 q11, \cd6, d1[1]
+ vmlsl.s32 q3, \cd7, d1[1]
+ vrshrn.s64 d26, q13, #14
+ vrshrn.s64 d27, q15, #14
+ vrshrn.s64 d24, q12, #14
+ vrshrn.s64 d25, q14, #14
+ vrshrn.s64 d20, q10, #14
+ vrshrn.s64 d21, q8, #14
+ vrshrn.s64 d22, q11, #14
+ vrshrn.s64 d23, q3, #14
+ vadd.i32 \c0, q12, q13
+ vsub.i32 \c3, q12, q13
+ vadd.i32 \c1, q10, q11
+ vsub.i32 \c2, q10, q11
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmul.s32 q10, \c0, d2[0]
+ vmla.s32 q10, \c2, d2[1]
+ vmla.s32 q10, \c3, d3[0]
+ vmul.s32 q11, \c0, d3[0]
+ vmls.s32 q11, \c2, d2[0]
+ vsub.s32 \c0, \c0, \c2
+ vmls.s32 q11, \c3, d2[1]
+ vadd.s32 \c0, \c0, \c3
+ vmul.s32 q13, \c1, d3[1]
+ vmul.s32 q12, \c0, d3[1]
+ vadd.s32 q14, q10, q13
+ vadd.s32 q15, q11, q13
+ vrshr.s32 \c0, q14, #14
+ vadd.s32 q10, q10, q11
+ vrshr.s32 \c1, q15, #14
+ vsub.s32 q10, q10, q13
+ vrshr.s32 \c2, q12, #14
+ vrshr.s32 \c3, q10, #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmull.s32 q10, \cd0, d2[0]
+ vmull.s32 q4, \cd1, d2[0]
+ vmlal.s32 q10, \cd4, d2[1]
+ vmlal.s32 q4, \cd5, d2[1]
+ vmlal.s32 q10, \cd6, d3[0]
+ vmlal.s32 q4, \cd7, d3[0]
+ vmull.s32 q11, \cd0, d3[0]
+ vmull.s32 q5, \cd1, d3[0]
+ vmlsl.s32 q11, \cd4, d2[0]
+ vmlsl.s32 q5, \cd5, d2[0]
+ vsub.s32 \c0, \c0, \c2
+ vmlsl.s32 q11, \cd6, d2[1]
+ vmlsl.s32 q5, \cd7, d2[1]
+ vadd.s32 \c0, \c0, \c3
+ vmull.s32 q13, \cd2, d3[1]
+ vmull.s32 q6, \cd3, d3[1]
+ vmull.s32 q12, \cd0, d3[1]
+ vmull.s32 q7, \cd1, d3[1]
+ vadd.s64 q14, q10, q13
+ vadd.s64 q2, q4, q6
+ vadd.s64 q15, q11, q13
+ vadd.s64 q3, q5, q6
+ vrshrn.s64 \cd1, q2, #14
+ vrshrn.s64 \cd0, q14, #14
+ vadd.s64 q10, q10, q11
+ vadd.s64 q4, q4, q5
+ vrshrn.s64 \cd3, q3, #14
+ vrshrn.s64 \cd2, q15, #14
+ vsub.s64 q10, q10, q13
+ vsub.s64 q4, q4, q6
+ vrshrn.s64 \cd4, q12, #14
+ vrshrn.s64 \cd5, q7, #14
+ vrshrn.s64 \cd6, q10, #14
+ vrshrn.s64 \cd7, q4, #14
+.endm
+
+@ The public functions in this file have the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
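+@ Note: for the 10 and 12 bpp versions in this file, dst points at 16 bit
+@ pixels (the uint8_t pointer is just the common dsp prototype), the stride
+@ is in bytes, and the output is clamped to (1 << bpp) - 1.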
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel r12, itxfm4_coeffs
+ vld1.16 {d0}, [r12,:64]
+ vmovl.s16 q0, d0
+.endif
+.ifc \txfm1,iadst
+ movrel r12, iadst4_coeffs
+ vld1.16 {d1}, [r12,:64]
+ vmovl.s16 q1, d1
+.endif
+.else
+ movrel r12, itxfm4_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ @ iadst4_12 needs q4-q7
+ vpush {q4-q7}
+.endif
+.endif
+
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
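+ @ dc = (in[0] * 11585 + (1 << 13)) >> 14, applied twice (once per
+ @ dimension); the shared tail below then adds (dc + 8) >> 4 to each
+ @ pixel and clamps the result.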
+ vld1.32 {d4[]}, [r2,:32]
+ vmull.s32 q2, d4, d0[0]
+ vrshrn.s64 d4, q2, #14
+ vmull.s32 q2, d4, d0[0]
+ vrshrn.s64 d4, q2, #14
+ vst1.32 {d30[0]}, [r2,:32]
+ vdup.32 q2, d4[0]
+ vmov q3, q2
+ vmov q8, q2
+ vmov q9, q2
+ b 2f
+.endif
+
+1:
+ vld1.32 {q2-q3}, [r2,:128]
+ vst1.32 {q14-q15}, [r2,:128]!
+ vld1.32 {q8-q9}, [r2,:128]
+
+.ifc \txfm1,iwht
+ vshr.s32 q2, q2, #2
+ vshr.s32 q3, q3, #2
+ vshr.s32 q8, q8, #2
+ vshr.s32 q9, q9, #2
+.endif
+
+ vst1.16 {q14-q15}, [r2,:128]!
+ \txfm1\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19
+
+ @ Transpose 4x4 with 32 bit elements
+ vtrn.32 q2, q3
+ vtrn.32 q8, q9
+ vswp d5, d16
+ vswp d7, d18
+
+ \txfm2\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19
+2:
+ vmvn.u16 q15, #((0xffff << \bpp) & 0xffff)
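+ @ q15 = (1 << bpp) - 1, i.e. the maximum pixel value, used for clamping below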
+ vld1.16 {d0}, [r0,:64], r1
+ vld1.16 {d1}, [r0,:64], r1
+.ifnc \txfm1,iwht
+ vrshr.s32 q2, q2, #4
+ vrshr.s32 q3, q3, #4
+ vrshr.s32 q8, q8, #4
+ vrshr.s32 q9, q9, #4
+.endif
+ vaddw.u16 q2, q2, d0
+ vaddw.u16 q3, q3, d1
+ vld1.16 {d2}, [r0,:64], r1
+ vld1.16 {d3}, [r0,:64], r1
+ vqmovun.s32 d0, q2
+ vqmovun.s32 d1, q3
+ sub r0, r0, r1, lsl #2
+
+ vaddw.u16 q8, q8, d2
+ vmin.u16 q0, q0, q15
+ vaddw.u16 q9, q9, d3
+ vst1.16 {d0}, [r0,:64], r1
+ vqmovun.s32 d2, q8
+ vqmovun.s32 d3, q9
+ vmin.u16 q1, q1, q15
+
+ vst1.16 {d1}, [r0,:64], r1
+ vst1.16 {d2}, [r0,:64], r1
+ vst1.16 {d3}, [r0,:64], r1
+
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.endif
+.endif
+ bx lr
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct, idct, \bpp
+itxfm_func4x4 iadst, idct, \bpp
+itxfm_func4x4 idct, iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht, iwht, \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+.macro idct8
+ dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+ dmbutterfly d20, d21, d28, d29, d1[0], d1[1], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
+ dmbutterfly d18, d19, d30, d31, d2[0], d2[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
+ dmbutterfly d26, d27, d22, d23, d3[0], d3[1], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a
+
+ butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3
+ butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2
+ butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a
+ butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a
+
+ butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7]
+
+ dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5
+
+ butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4]
+ butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6]
+ butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+ movrel r12, iadst8_coeffs
+ vld1.16 {q1}, [r12,:128]!
+ vmovl.s16 q0, d2
+ vmovl.s16 q1, d3
+
+ dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d0[1], d0[0] @ q4,q5 = t1a, q2,q3 = t0a
+ dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+ dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4
+
+ dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5
+
+ dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a
+ dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a
+
+ dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6
+ dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+ vneg.s32 q15, q15 @ q15 = out[7]
+ butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2
+
+ dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a
+ dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a
+
+ dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7
+
+ dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+ vneg.s32 q11, q11 @ q11 = out[3]
+
+ dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
+ vneg.s32 q9, q9 @ q9 = out[1]
+
+ dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
+ vneg.s32 q13, q13 @ q13 = out[5]
+.endm
+
+function idct8x8_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #5
+ vdup.s16 q15, r8
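+ @ q15 = r8 = (1 << bpp) - 1 (the pixel maximum), set by the per-bpp entry points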
+
+ mov r3, r0
+ mov r12, #8
+1:
+ @ Loop to add the constant from q8 into all 8x8 outputs
+ subs r12, r12, #2
+ vld1.16 {q2}, [r0,:128], r1
+ vaddw.u16 q10, q8, d4
+ vld1.16 {q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d5
+ vaddw.u16 q12, q8, d6
+ vaddw.u16 q13, q8, d7
+ vqmovun.s32 d4, q10
+ vqmovun.s32 d5, q11
+ vqmovun.s32 d6, q12
+ vqmovun.s32 d7, q13
+ vmin.u16 q2, q2, q15
+ vst1.16 {q2}, [r3,:128], r1
+ vmin.u16 q3, q3, q15
+ vst1.16 {q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r8,pc}
+endfunc
+.ltorg
+
+.macro itxfm8_1d_funcs txfm
+@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
+@ transpose into a horizontal 8x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = slice offset
+@ r2 = src
+function \txfm\()8_1d_4x8_pass1_neon
+ mov r12, #32
+ vmov.s32 q2, #0
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+ vld1.32 {q\i}, [r2,:128]
+ vst1.32 {q2}, [r2,:128], r12
+.endr
+
+ \txfm\()8
+
+ @ Do two 4x4 transposes. Originally, q8-q15 contain the
+ @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
+ @ 4x4 blocks.
+ transpose32_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #4
+ beq 1f
+.irp i, 8, 12, 9, 13, 10, 14, 11, 15
+ vst1.32 {q\i}, [r0,:128]!
+.endr
+ bx lr
+1:
+ @ Special case: For the last input column (r1 == 4),
+ @ which would be stored as the last row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ last 4x4 block).
+.irp i, 12, 13, 14, 15
+ add r0, r0, #16
+ vst1.32 {q\i}, [r0,:128]!
+.endr
+ vmov q12, q8
+ vmov q13, q9
+ vmov q14, q10
+ vmov q15, q11
+ bx lr
+endfunc
+
+@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x8 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
+function \txfm\()8_1d_4x8_pass2_neon
+ mov r12, #32
+.irp i, 8, 9, 10, 11
+ vld1.32 {q\i}, [r2,:128], r12
+.endr
+ cmp r3, #0
+ beq 1f
+.irp i, 12, 13, 14, 15
+ vld1.32 {q\i}, [r2,:128], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ \txfm\()8
+
+ vdup.s16 q4, r8
+.macro load_add_store coef0, coef1, coef2, coef3
+ vld1.16 {d4}, [r0,:64], r1
+ vld1.16 {d5}, [r3,:64], r1
+ vld1.16 {d6}, [r0,:64], r1
+ vld1.16 {d7}, [r3,:64], r1
+
+ vrshr.s32 \coef0, \coef0, #5
+ vrshr.s32 \coef1, \coef1, #5
+ vrshr.s32 \coef2, \coef2, #5
+ vrshr.s32 \coef3, \coef3, #5
+
+ vaddw.u16 \coef0, \coef0, d4
+ vaddw.u16 \coef1, \coef1, d5
+ vaddw.u16 \coef2, \coef2, d6
+ vaddw.u16 \coef3, \coef3, d7
+
+ sub r0, r0, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ vqmovun.s32 d4, \coef0
+ vqmovun.s32 d5, \coef1
+ vqmovun.s32 d6, \coef2
+ vqmovun.s32 d7, \coef3
+
+ vmin.u16 q2, q2, q4
+ vmin.u16 q3, q3, q4
+
+ vst1.16 {d4}, [r0,:64], r1
+ vst1.16 {d5}, [r3,:64], r1
+ vst1.16 {d6}, [r0,:64], r1
+ vst1.16 {d7}, [r3,:64], r1
+.endm
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+.purgem load_add_store
+
+ bx lr
+endfunc
+.endm
+
+itxfm8_1d_funcs idct
+itxfm8_1d_funcs iadst
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct8x8_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.else
+ vpush {q4-q5}
+.endif
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #256
+ sub sp, sp, r7
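+ @ sp is now 16 byte aligned, with at least 256 bytes reserved; r7 holds
+ @ the total adjustment so the stack can be restored at the end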
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+
+.irp i, 0, 4
+ add r0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 4
+ cmp r3, #12
+ ble 1f
+.endif
+.endif
+ mov r1, #\i
+ add r2, r6, #(\i*4)
+ bl \txfm1\()8_1d_4x8_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ @ For all-zero slices in pass 1, set q12-q15 to zero (for the in-register
+ @ passthrough of coefficients to pass 2) and clear the end of the temp buffer
+ vmov.i32 q12, #0
+ vmov.i32 q13, #0
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+.rept 4
+ vst1.32 {q12-q13}, [r0,:128]!
+.endr
+3:
+.endif
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.irp i, 0, 4
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ mov r3, #\i
+ bl \txfm2\()8_1d_4x8_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.else
+ vpop {q4-q5}
+.endif
+ pop {r4-r8,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+ push {r4-r8,lr}
+ movw r8, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+ push {r4-r8,lr}
+ movw r8, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+function idct16x16_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #6
+ vdup.s16 q15, r9
+
+ mov r3, r0
+ mov r12, #16
+1:
+ @ Loop to add the constant from q8 into all 16x16 outputs
+ subs r12, r12, #2
+ vld1.16 {q0-q1}, [r0,:128], r1
+ vaddw.u16 q9, q8, d0
+ vaddw.u16 q10, q8, d1
+ vld1.16 {q2-q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d2
+ vaddw.u16 q12, q8, d3
+ vaddw.u16 q13, q8, d4
+ vaddw.u16 q14, q8, d5
+ vqmovun.s32 d0, q9
+ vaddw.u16 q9, q8, d6
+ vqmovun.s32 d1, q10
+ vaddw.u16 q10, q8, d7
+ vqmovun.s32 d2, q11
+ vqmovun.s32 d3, q12
+ vqmovun.s32 d4, q13
+ vqmovun.s32 d5, q14
+ vmin.u16 q0, q0, q15
+ vmin.u16 q1, q1, q15
+ vqmovun.s32 d6, q9
+ vqmovun.s32 d7, q10
+ vst1.16 {q0-q1}, [r3,:128], r1
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+ vst1.16 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r9,pc}
+endfunc
+.ltorg
+
+.macro idct16_end
+ butterfly d18, d11, d8, d11 @ d18 = t0a, d11 = t7a
+ butterfly d19, d22, d9, d22 @ d19 = t1a, d22 = t6
+ butterfly d8, d26, d20, d26 @ d8 = t2a, d26 = t5
+ butterfly d9, d10, d28, d10 @ d9 = t3a, d10 = t4
+ butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a
+ butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10
+ butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13
+ butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a
+
+ mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+ mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11
+
+ vswp d27, d29 @ d27 = t12, d29 = t13a
+ vswp d28, d27 @ d28 = t12, d27 = t11
+ butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15]
+ butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14]
+ butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6]
+ butterfly d23, d24, d11, d20 @ d23 = out[7], d24 = out[8]
+ butterfly d18, d29, d8, d29 @ d18 = out[2], d29 = out[13]
+ butterfly d19, d28, d9, d28 @ d19 = out[3], d28 = out[12]
+ vmov d8, d21 @ d8 = t10a
+ butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11]
+ butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10]
+ bx lr
+.endm
+
+function idct16
+ mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
+ mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
+ mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
+ mbutterfly d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a
+ mbutterfly d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a
+ mbutterfly d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a
+ mbutterfly d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a
+ mbutterfly d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a
+
+ butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3
+ butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2
+ butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5
+ butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ mbutterfly0_h d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
+ mbutterfly_h1 d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
+ mbutterfly_h1 d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
+ mbutterfly_h2 d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a
+ mbutterfly_h1 d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a
+ mbutterfly_h2 d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a
+ mbutterfly_h1 d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a
+ mbutterfly_h2 d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a
+
+ butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3
+ butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2
+ butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5
+ butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ vmov.s64 q12, #0
+ vmull.s32 q4, d17, d4[0]
+ vmull.s32 q5, d18, d2[1]
+ vmull.s32 q15, d18, d2[0]
+ vmlsl.s32 q12, d19, d7[1]
+ vmull.s32 q14, d17, d4[1]
+ vmull.s32 q13, d19, d7[0]
+ vmull.s32 q11, d16, d0[0]
+ vrshrn.s64 d16, q4, #14
+ vrshrn.s64 d11, q5, #14
+ vrshrn.s64 d10, q15, #14
+ vrshrn.s64 d24, q12, #14
+ vrshrn.s64 d29, q14, #14
+ vrshrn.s64 d17, q13, #14
+ vrshrn.s64 d28, q11, #14
+
+ mbutterfly_l q10, q11, d17, d24, d1[0], d1[1], neg=1
+ mbutterfly_l q9, q15, d29, d16, d1[0], d1[1]
+ vrshrn.s64 d27, q10, #14
+ vrshrn.s64 d21, q11, #14
+ vrshrn.s64 d23, q9, #14
+ vrshrn.s64 d25, q15, #14
+ vmov d8, d28
+ vmov d9, d28
+ mbutterfly0 d22, d26, d11, d10, d18, d30, q9, q15
+ vmov d20, d28
+ idct16_end
+endfunc
+
+function iadst16
+ movrel r12, iadst16_coeffs
+ vld1.16 {q0}, [r12,:128]!
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
+ mbutterfly_l q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
+ butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
+ mbutterfly_l q7, q6, d29, d18, d1[1], d1[0] @ q7 = t3, q6 = t2
+ butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
+ mbutterfly_l q3, q2, d21, d26, d3[1], d3[0] @ q3 = t11, q2 = t10
+
+ vld1.16 {q0}, [r12,:128]!
+ butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ mbutterfly_l q5, q4, d27, d20, d0[1], d0[0] @ q5 = t5, q4 = t4
+ butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
+
+ mbutterfly_l q7, q6, d19, d28, d2[1], d2[0] @ q7 = t13, q6 = t12
+ butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
+ mbutterfly_l q3, q2, d25, d22, d1[1], d1[0] @ q3 = t7, q2 = t6
+ butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
+
+ mbutterfly_l q5, q4, d17, d30, d3[1], d3[0] @ q5 = t15, q4 = t14
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a
+ mbutterfly_l q7, q6, d23, d24, d2[0], d2[1] @ q7 = t9, q6 = t8
+ butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a
+
+ mbutterfly_l q2, q3, d28, d19, d2[1], d2[0] @ q2 = t12, q3 = t13
+ butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a
+ mbutterfly_l q5, q4, d21, d26, d3[0], d3[1] @ q5 = t11, q4 = t10
+ butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0
+ butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a
+
+ mbutterfly_l q6, q7, d30, d17, d3[1], d3[0] @ q6 = t14, q7 = t15
+ butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1
+ butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a
+ butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a
+
+ butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2
+ butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3
+
+ mbutterfly_l q5, q4, d19, d28, d1[0], d1[1] @ q5 = t13, q4 = t12
+ mbutterfly_l q6, q7, d30, d17, d1[1], d1[0] @ q6 = t14, q7 = t15
+
+ butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a
+ butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a
+ vneg.s32 d29, d29 @ d29 = out[13]
+
+ mbutterfly_l q5, q4, d4, d5, d1[0], d1[1] @ q5 = t5a, q4 = t4a
+ mbutterfly_l q6, q7, d7, d6, d1[1], d1[0] @ q6 = t6a, q7 = t7a
+
+ butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a
+ butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10
+
+ butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6
+ vneg.s32 d19, d19 @ d19 = out[3]
+ butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7
+
+ butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = t3a
+ butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = t11
+
+ mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
+ mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11]
+ mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9]
+ mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]
+
+ vneg.s32 d31, d5 @ d31 = out[15]
+ vneg.s32 d17, d3 @ d17 = out[1]
+
+ vmov d16, d2
+ vmov d30, d4
+ bx lr
+endfunc
+
+.macro itxfm16_1d_funcs txfm, suffix
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x2 slice and store.
+@ r0 = dst (temp buffer)
+@ r2 = src
+function \txfm\()16_1d_2x16_pass1\suffix\()_neon
+ push {lr}
+
+ mov r12, #64
+ vmov.s32 q4, #0
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl \txfm\()16\suffix
+
+ @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+ @ transposed 2x2 blocks.
+ transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 2x2 blocks horizontally.
+.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
+ vst1.32 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+endfunc
+
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 2x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function \txfm\()16_1d_2x16_pass2\suffix\()_neon
+ push {lr}
+
+ mov r12, #64
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19, 20
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl \txfm\()16\suffix
+
+.macro load_add_store coef0, coef1, coef2, coef3
+ vrshr.s32 \coef0, \coef0, #6
+ vrshr.s32 \coef1, \coef1, #6
+
+ vld1.32 {d8[]}, [r0,:32], r1
+ vld1.32 {d8[1]}, [r3,:32], r1
+ vrshr.s32 \coef2, \coef2, #6
+ vrshr.s32 \coef3, \coef3, #6
+ vld1.32 {d9[]}, [r0,:32], r1
+ vld1.32 {d9[1]}, [r3,:32], r1
+ vaddw.u16 \coef0, \coef0, d8
+ vld1.32 {d10[]}, [r0,:32], r1
+ vld1.32 {d10[1]}, [r3,:32], r1
+ vaddw.u16 \coef1, \coef1, d9
+ vld1.32 {d11[]}, [r0,:32], r1
+ vld1.32 {d11[1]}, [r3,:32], r1
+
+ vqmovun.s32 d8, \coef0
+ vdup.s16 q8, r9
+ vqmovun.s32 d9, \coef1
+ sub r0, r0, r1, lsl #2
+ sub r3, r3, r1, lsl #2
+ vaddw.u16 \coef2, \coef2, d10
+ vaddw.u16 \coef3, \coef3, d11
+ vmin.u16 q4, q4, q8
+ vst1.32 {d8[0]}, [r0,:32], r1
+ vst1.32 {d8[1]}, [r3,:32], r1
+ vqmovun.s32 d10, \coef2
+ vst1.32 {d9[0]}, [r0,:32], r1
+ vst1.32 {d9[1]}, [r3,:32], r1
+ vqmovun.s32 d11, \coef3
+ vmin.u16 q5, q5, q8
+
+ vst1.32 {d10[0]}, [r0,:32], r1
+ vst1.32 {d10[1]}, [r3,:32], r1
+ vst1.32 {d11[0]}, [r0,:32], r1
+ vst1.32 {d11[1]}, [r3,:32], r1
+.endm
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+.purgem load_add_store
+
+ pop {pc}
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+itxfm16_1d_funcs idct, _quarter
+itxfm16_1d_funcs idct, _half
+.ltorg
+
+@ This is the minimum eob value for each subpartition, in increments of 2
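+@ If the eob is at most the value listed for a given 2 column slice, that
+@ slice and all following ones only contain zero coefficients, so pass 1 is
+@ skipped for them and their part of the temp buffer is simply cleared.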
+const min_eob_idct_idct_16, align=4
+ .short 0, 3, 10, 22, 38, 62, 89, 121
+endconst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct16x16_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.else
+ vpush {q4-q5}
+.endif
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #1024
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #10
+ ble idct16x16_quarter_add_16_neon
+ cmp r3, #38
+ ble idct16x16_half_add_16_neon
+
+ movrel r8, min_eob_idct_idct_16 + 2
+.endif
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, sp, #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(16 - \i)/2
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl \txfm1\()16_1d_2x16_pass1_neon
+.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+2:
+ subs r1, r1, #1
+ @ Unroll for 2 lines
+.rept 2
+ @ Fill one line with zeros
+ vst1.32 {q14-q15}, [r0,:128]!
+ vst1.32 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.endif
+
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl \txfm2\()16_1d_2x16_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.else
+ vpop {q4-q5}
+.endif
+ pop {r4-r9,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+.ltorg
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+.irp i, 0, 2
+ add r0, sp, #(\i*64)
+.ifc \size,quarter
+.if \i == 2
+ cmp r3, #3
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl idct16_1d_2x16_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 4, 6
+ add r0, sp, #(\i*64)
+.if \i == 6
+ cmp r3, #22
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct16_1d_2x16_pass1_\size\()_neon
+.endr
+.endif
+
+ b 3f
+1:
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+
+ @ Unroll for 2 lines
+.rept 2
+ @ Fill one line with zeros
+ vst1.32 {q14-q15}, [r0,:128]!
+ vst1.32 {q14-q15}, [r0,:128]!
+.endr
+
+3:
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct16_1d_2x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q5}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #6
+ vdup.s16 q15, r9
+
+ mov r3, r0
+ mov r12, #32
+ sub r1, r1, #32
+1:
+ @ Loop to add the constant from q8 into all 32x32 outputs
+ subs r12, r12, #1
+ vld1.16 {q0-q1}, [r0,:128]!
+ vaddw.u16 q9, q8, d0
+ vaddw.u16 q10, q8, d1
+ vld1.16 {q2-q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d2
+ vaddw.u16 q12, q8, d3
+ vaddw.u16 q13, q8, d4
+ vaddw.u16 q14, q8, d5
+ vqmovun.s32 d0, q9
+ vaddw.u16 q9, q8, d6
+ vqmovun.s32 d1, q10
+ vaddw.u16 q10, q8, d7
+ vqmovun.s32 d2, q11
+ vqmovun.s32 d3, q12
+ vqmovun.s32 d4, q13
+ vqmovun.s32 d5, q14
+ vmin.u16 q0, q0, q15
+ vmin.u16 q1, q1, q15
+ vqmovun.s32 d6, q9
+ vqmovun.s32 d7, q10
+ vst1.16 {q0-q1}, [r3,:128]!
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+ vst1.16 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r9,pc}
+endfunc
+
+.macro idct32_end
+ butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a
+ butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18
+ butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
+ butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21
+ butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a
+ butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26
+ butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
+ butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29
+
+ mbutterfly d27, d20, d1[0], d1[1], q12, q15 @ d27 = t18a, d20 = t29a
+ mbutterfly d29, d9, d1[0], d1[1], q12, q15 @ d29 = t19, d9 = t28
+ mbutterfly d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27, d10 = t20
+ mbutterfly d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
+
+ butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24
+ butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+ butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16
+ butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+ butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21
+ butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a
+ butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26
+ butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+ vmov d29, d8 @ d29 = t29
+
+ mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20
+ mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+ mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
+ mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
+ bx lr
+.endm
+
+function idct32_odd
+ movrel r12, idct_coeffs
+
+ @ Overwrite the idct16 coeffs with the stored ones for idct32
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ mbutterfly d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+ @ Reload the idct16 coefficients. We could swap the coefficients between
+ @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
+ @ loading and lengthening.
+ vld1.16 {q0-q1}, [r12,:128]
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ movrel r12, idct_coeffs
+
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ mbutterfly_h1 d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly_h2 d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly_h1 d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly_h2 d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly_h1 d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly_h2 d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly_h1 d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly_h2 d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+ vld1.16 {q0-q1}, [r12,:128]
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ movrel r12, idct_coeffs
+
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ vmov.s64 q14, #0
+ vmov.s64 q5, #0
+
+ vmull.s32 q4, d16, d0[0]
+ vmlsl.s32 q14, d19, d3[1]
+ vmull.s32 q15, d16, d0[1]
+ vmull.s32 q11, d17, d7[0]
+ vmlsl.s32 q5, d17, d7[1]
+ vmull.s32 q13, d19, d3[0]
+ vmull.s32 q10, d18, d4[0]
+ vmull.s32 q12, d18, d4[1]
+
+ vld1.16 {q0-q1}, [r12,:128]
+
+ vrshrn.s64 d8, q4, #14
+ vrshrn.s64 d9, q14, #14
+ vrshrn.s64 d29, q15, #14
+ vrshrn.s64 d28, q11, #14
+
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ vrshrn.s64 d11, q5, #14
+ vrshrn.s64 d31, q13, #14
+ vrshrn.s64 d10, q10, #14
+ vrshrn.s64 d30, q12, #14
+
+ mbutterfly_l q8, q9, d29, d8, d2[0], d2[1]
+ mbutterfly_l q13, q10, d31, d9, d2[0], d2[1], neg=1
+ vrshrn.s64 d23, q8, #14
+ vrshrn.s64 d24, q9, #14
+ vrshrn.s64 d27, q13, #14
+ vrshrn.s64 d20, q10, #14
+ mbutterfly_l q8, q9, d30, d10, d3[0], d3[1]
+ vrshrn.s64 d21, q8, #14
+ vrshrn.s64 d26, q9, #14
+ mbutterfly_l q8, q9, d28, d11, d3[0], d3[1], neg=1
+ vrshrn.s64 d25, q8, #14
+ vrshrn.s64 d22, q9, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
+@ We don't have the register space to do a single-pass IDCT of 2x32,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs:
+@ a normal IDCT16 on every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added to/subtracted from the outputs of the first idct16.
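+@ In rough pseudocode: e = idct16 of the even inputs, o = a 16-point
+@ transform of the odd inputs, and then out[i] = e[i] + o[i] and
+@ out[31-i] = e[i] - o[i] for i = 0..15.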
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+function idct32_1d_2x32_pass1\suffix\()_neon
+ push {lr}
+
+ @ Double stride of the input, since we only read every other line
+ mov r12, #256
+ vmov.s32 d8, #0
+
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct16\suffix
+
+ @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+ @ transposed 2x2 blocks.
+ transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
+ @ by the same registers h, g, f, e, d, c, b, a mirrored.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+ vst1.32 {d\i}, [r0,:64]!
+ vrev64.32 d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+ vst1.32 {d\i}, [r0,:64]!
+.endr
+.endm
+ store_rev 16, 18, 20, 22, 24, 26, 28, 30
+ store_rev 17, 19, 21, 23, 25, 27, 29, 31
+ sub r0, r0, #256
+.purgem store_rev
+
+ @ Move r2 back to the start of the input, and move
+ @ to the first odd row
+.ifb \suffix
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+ sub r2, r2, r12, lsl #3
+.endif
+ add r2, r2, #128
+
+ vmov.s32 d8, #0
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+ @ Store the registers a, b, c, d, e, f, g, h horizontally,
+ @ adding into the output first, and then mirrored, subtracted
+ @ from the output.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+ vld1.32 {d8}, [r0,:64]
+ vadd.s32 d8, d8, d\i
+ vst1.32 {d8}, [r0,:64]!
+ vrev64.32 d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+ vld1.32 {d8}, [r0,:64]
+ vsub.s32 d8, d8, d\i
+ vst1.32 {d8}, [r0,:64]!
+.endr
+.endm
+
+ store_rev 31, 29, 27, 25, 23, 21, 19, 17
+ store_rev 30, 28, 26, 24, 22, 20, 18, 16
+.purgem store_rev
+ pop {pc}
+endfunc
+.ltorg
+
+@ This is mostly the same as 2x32_pass1, but without the transpose;
+@ it uses the source as a temp buffer between the two idct passes and
+@ adds into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function idct32_1d_2x32_pass2\suffix\()_neon
+ push {lr}
+
+ mov r12, #256
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+
+ bl idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vst1.32 {d\i}, [r2,:64], r12
+.endr
+
+ sub r2, r2, r12, lsl #4
+ add r2, r2, #128
+
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+ sub r2, r2, #128
+
+ bl idct32_odd\suffix
+
+ @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
+ @ allow clobbering q2-q3 below.
+ vmovn.s32 d0, q0
+ vmovn.s32 d1, q1
+ vmovn.s32 d2, q2
+ vmovn.s32 d3, q3
+
+ mov r12, #256
+ vdup.s16 q4, r9
+.macro load_acc_store a, b, c, d, neg=0
+ vld1.32 {d4}, [r2,:64], r12
+ vld1.32 {d5}, [r2,:64], r12
+.if \neg == 0
+ vadd.s32 d4, d4, d\a
+ vld1.32 {d6}, [r2,:64], r12
+ vadd.s32 d5, d5, d\b
+ vld1.32 {d7}, [r2,:64], r12
+ vadd.s32 d6, d6, d\c
+ vadd.s32 d7, d7, d\d
+.else
+ vsub.s32 d4, d4, d\a
+ vld1.32 {d6}, [r2,:64], r12
+ vsub.s32 d5, d5, d\b
+ vld1.32 {d7}, [r2,:64], r12
+ vsub.s32 d6, d6, d\c
+ vsub.s32 d7, d7, d\d
+.endif
+ vld1.32 {d10[]}, [r0,:32], r1
+ vld1.32 {d10[1]}, [r0,:32], r1
+ vrshr.s32 q2, q2, #6
+ vld1.32 {d11[]}, [r0,:32], r1
+ vrshr.s32 q3, q3, #6
+ vld1.32 {d11[1]}, [r0,:32], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u16 q2, q2, d10
+ vaddw.u16 q3, q3, d11
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q4
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+.endm
+ load_acc_store 31, 30, 29, 28
+ load_acc_store 27, 26, 25, 24
+ load_acc_store 23, 22, 21, 20
+ load_acc_store 19, 18, 17, 16
+ sub r2, r2, r12
+ neg r12, r12
+ load_acc_store 16, 17, 18, 19, 1
+ load_acc_store 20, 21, 22, 23, 1
+ load_acc_store 24, 25, 26, 27, 1
+ load_acc_store 28, 29, 30, 31, 1
+.purgem load_acc_store
+ @ Lengthen the idct16 coeffs back into 32 bit form
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ pop {pc}
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+ cmp r3, #1
+ beq idct32x32_dc_add_neon
+ vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32 + 2
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #4096
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]!
+ vld1.16 {q6-q7}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ cmp r3, #34
+ ble idct32x32_quarter_add_16_neon
+ cmp r3, #135
+ ble idct32x32_half_add_16_neon
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, sp, #(\i*128)
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(32 - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 2
+ @ Fill one line with zeros
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct32_1d_2x32_pass2_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x03ff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x0fff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size, rows
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 2, 4, 6
+ add r0, sp, #(\i*128)
+.ifc \size,quarter
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(\rows - \i)/2
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.ifc \size,half
+ add r8, r8, #8
+.irp i, 8, 10, 12, 14
+ add r0, sp, #(\i*128)
+.if \i > 8
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(\rows - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 2
+ @ Fill one line with zeros
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct32_1d_2x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+idct32_partial quarter, 8
+idct32_partial half, 16
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
new file mode 100644
index 0000000000..6c09922cae
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -0,0 +1,1688 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+@ Do four 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose16_q_4x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vtrn.32 \rq0, \rq1
+ vtrn.32 \rq2, \rq3
+ vtrn.32 \rq4, \rq5
+ vtrn.32 \rq6, \rq7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+ vtrn.16 \r8, \r9
+ vtrn.16 \r10, \r11
+ vtrn.16 \r12, \r13
+ vtrn.16 \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+ vadd.s16 \tmpd1, \in1, \in2
+ vsub.s16 \tmpd2, \in1, \in2
+ vmull.s16 \tmpq3, \tmpd1, d0[0]
+ vmull.s16 \tmpq4, \tmpd2, d0[0]
+.if \neg > 0
+ vneg.s32 \tmpq3, \tmpq3
+.endif
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+.endm
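+@ In practice d0[0] holds 11585 (cos(pi/4) scaled by 2^14), so per lane this
+@ is roughly out1 = ((in1 + in2) * 11585 + (1 << 13)) >> 14, and likewise
+@ for out2 with the difference.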
+
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s16 \tmpq3, \in1, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq3, #14
+.endm
+
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can be used with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+ vadd.s16 \tmpq1, \in1, \in2
+ vsub.s16 \tmpq2, \in1, \in2
+ vmull.s16 \tmpq3, \tmpd11, d0[0]
+ vmull.s16 \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+ vmull.s16 \tmpq3, \tmpd21, d0[0]
+ vmull.s16 \tmpq4, \tmpd22, d0[0]
+ vrshrn.s32 \out3, \tmpq3, #14
+ vrshrn.s32 \out4, \tmpq4, #14
+.else
+ vmull.s16 \tmpq5, \tmpd21, d0[0]
+ vmull.s16 \tmpq6, \tmpd22, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+ vrshrn.s32 \out3, \tmpq5, #14
+ vrshrn.s32 \out4, \tmpq6, #14
+.endif
+.endm
+
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
+ vmull.s16 \out1, \in1, \coef1
+ vmlsl.s16 \out1, \in2, \coef2
+ vmull.s16 \out2, \in1, \coef2
+ vmlal.s16 \out2, \in2, \coef1
+.endm
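+@ Viewed as a complex multiply, this rotates (in1 + i*in2) by
+@ (coef1 + i*coef2); the coefficient pairs fed to it are cosine/sine values
+@ scaled by 2^14.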
+
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+ vmull.s16 \out1, \in1, \coef1
+ vmull.s16 \out2, \in2, \coef1
+ vmull.s16 \out3, \in1, \coef2
+ vmull.s16 \out4, \in2, \coef2
+ vmlsl.s16 \out1, \in3, \coef2
+ vmlsl.s16 \out2, \in4, \coef2
+ vmlal.s16 \out3, \in3, \coef1
+ vmlal.s16 \out4, \in4, \coef1
+.endm
+
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+ mbutterfly_l \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+ vneg.s32 \tmp2, \tmp2
+.endif
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s16 \tmp1, \inout1, \coef1
+ vmull.s16 \tmp2, \inout1, \coef2
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s16 \tmp1, \inout2, \coef2
+ vmull.s16 \tmp2, \inout2, \coef1
+ vneg.s32 \tmp1, \tmp1
+ vrshrn.s32 \inout2, \tmp2, #14
+ vrshrn.s32 \inout1, \tmp1, #14
+.endm
+
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+ vrshrn.s32 \inout3, \tmp3, #14
+ vrshrn.s32 \inout4, \tmp4, #14
+.endm
+
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+ vadd.s16 \out1, \in1, \in2
+ vsub.s16 \out2, \in1, \in2
+.endm
+
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+ vsub.s16 \out1, \in1, \in2
+ vadd.s16 \out2, \in1, \in2
+.endm
+
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+ vadd.s32 \tmp1, \in1, \in2
+ vsub.s32 \tmp2, \in1, \in2
+ vrshrn.s32 \out1, \tmp1, #14
+ vrshrn.s32 \out2, \tmp2, #14
+.endm
+
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ vadd.s32 \tmp1, \in1, \in3
+ vadd.s32 \tmp2, \in2, \in4
+ vsub.s32 \tmp3, \in1, \in3
+ vsub.s32 \tmp4, \in2, \in4
+ vrshrn.s32 \out1, \tmp1, #14
+ vrshrn.s32 \out2, \tmp2, #14
+ vrshrn.s32 \out3, \tmp3, #14
+ vrshrn.s32 \out4, \tmp4, #14
+.endm
+
+
+.macro iwht4 c0, c1, c2, c3
+ vadd.i16 \c0, \c0, \c1
+ vsub.i16 d17, \c2, \c3
+ vsub.i16 d16, \c0, d17
+ vshr.s16 d16, d16, #1
+ vsub.i16 \c2, d16, \c1
+ vsub.i16 \c1, d16, \c3
+ vadd.i16 \c3, d17, \c2
+ vsub.i16 \c0, \c0, \c1
+.endm
+
+.macro idct4 c0, c1, c2, c3
+ vmull.s16 q13, \c1, d0[3]
+ vmull.s16 q11, \c1, d0[2]
+ vadd.i16 d16, \c0, \c2
+ vsub.i16 d17, \c0, \c2
+ vmlal.s16 q13, \c3, d0[2]
+ vmull.s16 q9, d16, d0[0]
+ vmull.s16 q10, d17, d0[0]
+ vmlsl.s16 q11, \c3, d0[3]
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d18, q9, #14
+ vrshrn.s32 d20, q10, #14
+ vrshrn.s32 d22, q11, #14
+ vadd.i16 \c0, d18, d26
+ vsub.i16 \c3, d18, d26
+ vadd.i16 \c1, d20, d22
+ vsub.i16 \c2, d20, d22
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+ vmull.s16 q10, \c0, d1[0]
+ vmlal.s16 q10, \c2, d1[1]
+ vmlal.s16 q10, \c3, d1[2]
+ vmull.s16 q11, \c0, d1[2]
+ vmlsl.s16 q11, \c2, d1[0]
+ vsub.s16 \c0, \c0, \c2
+ vmlsl.s16 q11, \c3, d1[1]
+ vadd.s16 \c0, \c0, \c3
+ vmull.s16 q13, \c1, d1[3]
+ vmull.s16 q12, \c0, d1[3]
+ vadd.s32 q14, q10, q13
+ vadd.s32 q1, q11, q13
+ vrshrn.s32 \c0, q14, #14
+ vadd.s32 q10, q10, q11
+ vrshrn.s32 \c1, q1, #14
+ vsub.s32 q10, q10, q13
+ vrshrn.s32 \c2, q12, #14
+ vrshrn.s32 \c3, q10, #14
+.endm
+
+@ The public functions in this file have the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
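+@ eob is one past the position of the last nonzero coefficient in scan
+@ order; eob == 1 thus means only the DC coefficient is set, which the
+@ idct_idct versions handle via a dedicated fast path.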
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel r12, itxfm4_coeffs
+ vld1.16 {d0}, [r12,:64]
+.endif
+.ifc \txfm1,iadst
+ movrel r12, iadst4_coeffs
+ vld1.16 {d1}, [r12,:64]
+.endif
+.else
+ movrel r12, itxfm4_coeffs
+ vld1.16 {q0}, [r12,:128]
+.endif
+
+ vmov.i16 q15, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
+ vld1.16 {d4[]}, [r2,:16]
+ vmull.s16 q2, d4, d0[0]
+ vrshrn.s32 d4, q2, #14
+ vmull.s16 q2, d4, d0[0]
+ vrshrn.s32 d4, q2, #14
+ vst1.16 {d30[0]}, [r2,:16]
+ vdup.16 q2, d4[0]
+ vmov q3, q2
+ b 2f
+.endif
+
+1:
+ vld1.16 {d4-d7}, [r2,:128]
+ vst1.16 {q15}, [r2,:128]!
+
+.ifc \txfm1,iwht
+ vshr.s16 q2, q2, #2
+ vshr.s16 q3, q3, #2
+.endif
+
+ \txfm1\()4 d4, d5, d6, d7
+
+ vst1.16 {q15}, [r2,:128]!
+ @ Transpose 4x4 with 16 bit elements
+ vtrn.16 d4, d5
+ vtrn.16 d6, d7
+ vtrn.32 q2, q3
+
+ \txfm2\()4 d4, d5, d6, d7
+2:
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d0[1]}, [r0,:32], r1
+.ifnc \txfm1,iwht
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+.endif
+ vaddw.u8 q2, q2, d0
+ vld1.32 {d1[]}, [r0,:32], r1
+ vld1.32 {d1[1]}, [r0,:32], r1
+ vqmovun.s16 d0, q2
+ sub r0, r0, r1, lsl #2
+
+ vaddw.u8 q3, q3, d1
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vqmovun.s16 d1, q3
+
+ vst1.32 {d0[1]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ vst1.32 {d1[1]}, [r0,:32], r1
+
+ bx lr
+endfunc
+.endm
+
+itxfm_func4x4 idct, idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct, iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht, iwht
+
+
+.macro idct8
+ dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+ dmbutterfly d20, d21, d28, d29, d0[2], d0[3], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
+ dmbutterfly d18, d19, d30, d31, d1[0], d1[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
+ dmbutterfly d26, d27, d22, d23, d1[2], d1[3], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a
+
+ butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3
+ butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2
+ butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a
+ butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a
+
+ butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7]
+
+ dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5
+
+ butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4]
+ butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6]
+ butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+ dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d2[1], d2[0] @ q4,q5 = t1a, q2,q3 = t0a
+ dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+ dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4
+
+ dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5
+
+ dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
+ dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
+
+ dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6
+ dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7
+
+ butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+ vneg.s16 q15, q15 @ q15 = out[7]
+ butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2
+
+ dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[2], d0[3] @ q10,q11 = t5a, q5,q7 = t4a
+ dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[3], d0[2] @ q2,q3 = t6a, q13,q14 = t7a
+
+ dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7
+
+ dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+ vneg.s16 q11, q11 @ q11 = out[3]
+
+ dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
+ vneg.s16 q9, q9 @ q9 = out[1]
+
+ dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
+ vneg.s16 q13, q13 @ q13 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+ @ Push q4-q7 if iadst is used; the idct needs
+ @ fewer scratch registers, so only push q4-q5
+ @ when only the idct is involved.
+ @ The iadst also uses a few coefficients from the
+ @ idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+ movrel r12, idct_coeffs
+ vpush {q4-q5}
+.else
+ movrel r12, iadst8_coeffs
+ vld1.16 {q1}, [r12,:128]!
+ vpush {q4-q7}
+.endif
+ vld1.16 {q0}, [r12,:128]
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+ vmov q12, q8
+ vmov q13, q8
+ vmov q14, q8
+ vmov q15, q8
+ vst1.16 {d4[0]}, [r2,:16]
+ b 2f
+.endif
+1:
+ vld1.16 {q8-q9}, [r2,:128]!
+ vld1.16 {q10-q11}, [r2,:128]!
+ vld1.16 {q12-q13}, [r2,:128]!
+ vld1.16 {q14-q15}, [r2,:128]!
+ sub r2, r2, #128
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+
+ \txfm1\()8
+
+ @ Transpose 8x8 with 16 bit elements
+ vswp d17, d24
+ vswp d19, d26
+ vswp d21, d28
+ vswp d23, d30
+ transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
+
+ \txfm2\()8
+2:
+ mov r3, r0
+ @ Add into the destination
+ vld1.8 {d4}, [r0,:64], r1
+ vrshr.s16 q8, q8, #5
+ vld1.8 {d5}, [r0,:64], r1
+ vrshr.s16 q9, q9, #5
+ vld1.8 {d6}, [r0,:64], r1
+ vrshr.s16 q10, q10, #5
+ vaddw.u8 q8, q8, d4
+ vld1.8 {d7}, [r0,:64], r1
+ vrshr.s16 q11, q11, #5
+ vaddw.u8 q9, q9, d5
+ vld1.8 {d8}, [r0,:64], r1
+ vrshr.s16 q12, q12, #5
+ vaddw.u8 q10, q10, d6
+ vqmovun.s16 d4, q8
+ vld1.8 {d9}, [r0,:64], r1
+ vrshr.s16 q13, q13, #5
+ vaddw.u8 q11, q11, d7
+ vqmovun.s16 d5, q9
+ vld1.8 {d10}, [r0,:64], r1
+ vrshr.s16 q14, q14, #5
+ vaddw.u8 q12, q12, d8
+ vqmovun.s16 d6, q10
+ vld1.8 {d11}, [r0,:64], r1
+ vrshr.s16 q15, q15, #5
+ vaddw.u8 q13, q13, d9
+ vqmovun.s16 d7, q11
+
+
+ vst1.8 {d4}, [r3,:64], r1
+ vaddw.u8 q14, q14, d10
+ vst1.8 {d5}, [r3,:64], r1
+ vqmovun.s16 d8, q12
+ vst1.8 {d6}, [r3,:64], r1
+ vaddw.u8 q15, q15, d11
+ vst1.8 {d7}, [r3,:64], r1
+ vqmovun.s16 d9, q13
+ vst1.8 {d8}, [r3,:64], r1
+ vqmovun.s16 d10, q14
+ vst1.8 {d9}, [r3,:64], r1
+ vqmovun.s16 d11, q15
+
+ vst1.8 {d10}, [r3,:64], r1
+ vst1.8 {d11}, [r3,:64], r1
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q5}
+.else
+ vpop {q4-q7}
+.endif
+ bx lr
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+.ltorg
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i16 q2, #0
+
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vst1.16 {d4[0]}, [r2,:16]
+
+ vrshr.s16 q8, q8, #6
+
+ mov r3, r0
+ mov r12, #16
+1:
+ @ Loop to add the constant from q8 into all 16x16 outputs
+ subs r12, r12, #2
+ vld1.8 {q2}, [r0,:128], r1
+ vaddw.u8 q10, q8, d4
+ vld1.8 {q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d5
+ vaddw.u8 q12, q8, d6
+ vaddw.u8 q13, q8, d7
+ vqmovun.s16 d4, q10
+ vqmovun.s16 d5, q11
+ vqmovun.s16 d6, q12
+ vst1.8 {q2}, [r3,:128], r1
+ vqmovun.s16 d7, q13
+ vst1.8 {q3}, [r3,:128], r1
+ bne 1b
+
+ bx lr
+endfunc
+.ltorg
+
+.macro idct16_end
+ butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = t7a
+ butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = t6
+ butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = t5
+ butterfly d5, d6, d28, d6 @ d5 = t3a, d6 = t4
+ butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a
+ butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10
+ butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13
+ butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a
+
+ mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+ mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11
+
+ vswp d27, d29 @ d27 = t12, d29 = t13a
+ vswp d28, d27 @ d28 = t12, d27 = t11
+ butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15]
+ butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14]
+ butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6]
+ butterfly d23, d24, d7, d20 @ d23 = out[7], d24 = out[8]
+ butterfly d18, d29, d4, d29 @ d18 = out[2], d29 = out[13]
+ butterfly d19, d28, d5, d28 @ d19 = out[3], d28 = out[12]
+ vmov d4, d21 @ d4 = t10a
+ butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11]
+ butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10]
+ bx lr
+.endm
+
+function idct16
+ mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
+ mbutterfly d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a
+ mbutterfly d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a
+ mbutterfly d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a
+ mbutterfly d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a
+ mbutterfly d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a
+ mbutterfly d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a
+ mbutterfly d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a
+
+ butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3
+ butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2
+ butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5
+ butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
+ mbutterfly_h1 d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a
+ mbutterfly_h1 d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a
+ mbutterfly_h2 d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a
+ mbutterfly_h1 d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a
+ mbutterfly_h2 d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a
+ mbutterfly_h1 d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a
+ mbutterfly_h2 d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a
+
+ butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3
+ butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2
+ butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5
+ butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ vmull.s16 q12, d19, d3[3]
+ vmull.s16 q2, d17, d2[0]
+ vmull.s16 q3, d18, d1[1]
+ vmull.s16 q15, d18, d1[0]
+ vneg.s32 q12, q12
+ vmull.s16 q14, d17, d2[1]
+ vmull.s16 q13, d19, d3[2]
+ vmull.s16 q11, d16, d0[0]
+ vrshrn.s32 d24, q12, #14
+ vrshrn.s32 d16, q2, #14
+ vrshrn.s32 d7, q3, #14
+ vrshrn.s32 d6, q15, #14
+ vrshrn.s32 d29, q14, #14
+ vrshrn.s32 d17, q13, #14
+ vrshrn.s32 d28, q11, #14
+
+ mbutterfly_l q10, q11, d17, d24, d0[2], d0[3]
+ mbutterfly_l q9, q15, d29, d16, d0[2], d0[3]
+ vneg.s32 q11, q11
+ vrshrn.s32 d27, q10, #14
+ vrshrn.s32 d21, q11, #14
+ vrshrn.s32 d23, q9, #14
+ vrshrn.s32 d25, q15, #14
+ vmov d4, d28
+ vmov d5, d28
+ mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15
+ vmov d20, d28
+ idct16_end
+endfunc
+
+function iadst16
+ movrel r12, iadst16_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+
+ mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
+ mbutterfly_l q5, q4, d23, d24, d1[1], d1[0] @ q5 = t9, q4 = t8
+ butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
+ mbutterfly_l q7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = t2
+ butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
+
+ mbutterfly_l q3, q2, d21, d26, d1[3], d1[2] @ q3 = t11, q2 = t10
+ butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
+ mbutterfly_l q5, q4, d27, d20, d2[1], d2[0] @ q5 = t5, q4 = t4
+ butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
+
+ mbutterfly_l q7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = t12
+ butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
+ mbutterfly_l q3, q2, d25, d22, d2[3], d2[2] @ q3 = t7, q2 = t6
+ butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
+
+ mbutterfly_l q5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = t14
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a
+ mbutterfly_l q7, q6, d23, d24, d1[0], d1[1] @ q7 = t9, q6 = t8
+ butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a
+
+ mbutterfly_l q2, q3, d28, d19, d1[1], d1[0] @ q2 = t12, q3 = t13
+ butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a
+ mbutterfly_l q5, q4, d21, d26, d1[2], d1[3] @ q5 = t11, q4 = t10
+ butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0
+ butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a
+
+ mbutterfly_l q6, q7, d30, d17, d1[3], d1[2] @ q6 = t14, q7 = t15
+ butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1
+ butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a
+ butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a
+
+ butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2
+ butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3
+
+ mbutterfly_l q5, q4, d19, d28, d0[2], d0[3] @ q5 = t13, q4 = t12
+ mbutterfly_l q6, q7, d30, d17, d0[3], d0[2] @ q6 = t14, q7 = t15
+
+ butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a
+ butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a
+ vneg.s16 d29, d29 @ d29 = out[13]
+
+ mbutterfly_l q5, q4, d4, d5, d0[2], d0[3] @ q5 = t5a, q4 = t4a
+ mbutterfly_l q6, q7, d7, d6, d0[3], d0[2] @ q6 = t6a, q7 = t7a
+
+ butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a
+ butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10
+
+ butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6
+ vneg.s16 d19, d19 @ d19 = out[3]
+ butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7
+
+ butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = t3a
+ butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = t11
+
+ mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
+ mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11]
+ mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9]
+ mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]
+
+ vneg.s16 d31, d5 @ d31 = out[15]
+ vneg.s16 d17, d3 @ d17 = out[1]
+
+ vmov d16, d2
+ vmov d30, d4
+ bx lr
+endfunc
+
+.macro load_add_store coef0, coef1, coef2, coef3
+ vrshr.s16 \coef0, \coef0, #6
+ vrshr.s16 \coef1, \coef1, #6
+
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d4[1]}, [r3,:32], r1
+ vrshr.s16 \coef2, \coef2, #6
+ vrshr.s16 \coef3, \coef3, #6
+ vld1.32 {d5[]}, [r0,:32], r1
+ vld1.32 {d5[1]}, [r3,:32], r1
+ vaddw.u8 \coef0, \coef0, d4
+ vld1.32 {d6[]}, [r0,:32], r1
+ vld1.32 {d6[1]}, [r3,:32], r1
+ vaddw.u8 \coef1, \coef1, d5
+ vld1.32 {d7[]}, [r0,:32], r1
+ vld1.32 {d7[1]}, [r3,:32], r1
+
+ vqmovun.s16 d4, \coef0
+ vqmovun.s16 d5, \coef1
+ sub r0, r0, r1, lsl #2
+ sub r3, r3, r1, lsl #2
+ vaddw.u8 \coef2, \coef2, d6
+ vaddw.u8 \coef3, \coef3, d7
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r3,:32], r1
+ vqmovun.s16 d6, \coef2
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r3,:32], r1
+ vqmovun.s16 d7, \coef3
+
+ vst1.32 {d6[0]}, [r0,:32], r1
+ vst1.32 {d6[1]}, [r3,:32], r1
+ vst1.32 {d7[0]}, [r0,:32], r1
+ vst1.32 {d7[1]}, [r3,:32], r1
+.endm
+
+.macro itxfm16_1d_funcs txfm
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = slice offset
+@ r2 = src
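+@ Roughly, in scalar terms (an illustrative sketch only; r2 already points
+@ at the start of the 4-wide column slice, with a row stride of 16 int16):
+@   for (i = 0; i < 16; i++) {
+@       in[i][0..3]      = src[i*16 + 0..3];  // gather the slice
+@       src[i*16 + 0..3] = 0;                 // clear consumed coefficients
+@   }
+@   txfm16(in);                               // 1-D transform on d16-d31
+@   // write the transposed 16x4 result contiguously into the temp buffer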
+function \txfm\()16_1d_4x16_pass1_neon
+ push {lr}
+
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl \txfm\()16
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #12
+ beq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+1:
+ @ Special case: For the last input column (r1 == 12),
+ @ which would be stored as the last row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ last 4x4 block).
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ vmov d28, d16
+ vmov d29, d17
+ vmov d30, d18
+ vmov d31, d19
+ pop {pc}
+endfunc
+
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
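+@ A slice offset of 0 means the last four rows (d28-d31) were carried over
+@ in registers from pass 1 (or zeroed for skipped slices), so they are not
+@ reloaded from the temp buffer.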
+function \txfm\()16_1d_4x16_pass2_neon
+ push {lr}
+ mov r12, #32
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ cmp r3, #0
+ beq 1f
+.irp i, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl \txfm\()16
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
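+
+@ A block with eob <= min_eob[i] has no nonzero coefficients in the 4-column
+@ slices from i onwards, so pass 1 below skips the transform for those slices
+@ and only zero-fills their part of the temp buffer.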
+
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct16x16_dc_add_neon
+.endif
+ push {r4-r8,lr}
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.endif
+
+ @ Align the stack, allocate a temp buffer
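+ @ r7 = (sp & 15) + 512, so that sp ends up 16-byte aligned with 512 bytes
+ @ (a 16x16 int16 buffer) reserved below it; the same amount is added back
+ @ before returning.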
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #512
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #10
+ ble idct16x16_quarter_add_neon
+ cmp r3, #38
+ ble idct16x16_half_add_neon
+
+ movrel r8, min_eob_idct_idct_16 + 2
+.endif
+
+.irp i, 0, 4, 8, 12
+ add r0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(16 - \i)/4
+ ble 1f
+.endif
+.endif
+ mov r1, #\i
+ add r2, r6, #(\i*2)
+ bl \txfm1\()16_1d_4x16_pass1_neon
+.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
+ @ passthrough of coefficients to pass 2, and clear the end of the temp buffer
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.endif
+
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+.endif
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.endif
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+.ltorg
+
+function idct16_1d_4x16_pass1_quarter_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_quarter
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ @ The first 4x4 block is kept in registers for the second pass,
+ @ store the rest in the temp buffer.
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+ push {lr}
+ @ Only load the top 4 lines, and only do it for the later slices.
+ @ For the first slice, d16-d19 are kept in registers from the first pass.
+ cmp r3, #0
+ beq 1f
+ mov r12, #32
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_quarter
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_half
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #4
+ beq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+1:
+ @ Special case: For the second input column (r1 == 4),
+ @ which would be stored as the second row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ second 4x4 block).
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ vmov d20, d16
+ vmov d21, d17
+ vmov d22, d18
+ vmov d23, d19
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+ push {lr}
+ mov r12, #32
+ cmp r3, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ beq 1f
+.irp i, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_half
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+.purgem load_add_store
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_neon
+ add r0, sp, #(0*32)
+ mov r1, #0
+ add r2, r6, #(0*2)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+ add r0, sp, #(4*32)
+ mov r1, #4
+ add r2, r6, #(4*2)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.endif
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i16 q2, #0
+
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vst1.16 {d4[0]}, [r2,:16]
+
+ vrshr.s16 q8, q8, #6
+
+ mov r3, r0
+ mov r12, #32
+1:
+ @ Loop to add the constant from q8 into all 32x32 outputs
+ subs r12, r12, #2
+ vld1.8 {q0-q1}, [r0,:128], r1
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vld1.8 {q2-q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vaddw.u8 q13, q8, d4
+ vaddw.u8 q14, q8, d5
+ vaddw.u8 q15, q8, d6
+ vqmovun.s16 d0, q9
+ vaddw.u8 q9, q8, d7
+ vqmovun.s16 d1, q10
+ vqmovun.s16 d2, q11
+ vqmovun.s16 d3, q12
+ vqmovun.s16 d4, q13
+ vqmovun.s16 d5, q14
+ vst1.8 {q0-q1}, [r3,:128], r1
+ vqmovun.s16 d6, q15
+ vqmovun.s16 d7, q9
+ vst1.8 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ bx lr
+endfunc
+
+.macro idct32_end
+ butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a
+ butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18
+ butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
+ butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21
+ butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a
+ butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26
+ butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
+ butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29
+
+ mbutterfly d27, d20, d0[2], d0[3], q12, q15 @ d27 = t18a, d20 = t29a
+ mbutterfly d29, d9, d0[2], d0[3], q12, q15 @ d29 = t19, d9 = t28
+ mbutterfly d28, d10, d0[2], d0[3], q12, q15, neg=1 @ d28 = t27, d10 = t20
+ mbutterfly d26, d21, d0[2], d0[3], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
+
+ butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24
+ butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+ butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16
+ butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+ butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21
+ butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a
+ butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26
+ butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+ vmov d29, d8 @ d29 = t29
+
+ mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20
+ mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+ mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
+ mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
+ bx lr
+.endm
+
+function idct32_odd
+ mbutterfly d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ mbutterfly_h1 d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly_h2 d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly_h1 d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly_h2 d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly_h1 d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly_h2 d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly_h1 d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly_h2 d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ vmull.s16 q4, d16, d4[0]
+ vmull.s16 q14, d19, d5[3]
+ vmull.s16 q15, d16, d4[1]
+ vmull.s16 q11, d17, d7[2]
+ vmull.s16 q5, d17, d7[3]
+ vmull.s16 q13, d19, d5[2]
+ vmull.s16 q10, d18, d6[0]
+ vmull.s16 q12, d18, d6[1]
+
+ vneg.s32 q14, q14
+ vneg.s32 q5, q5
+
+ vrshrn.s32 d8, q4, #14
+ vrshrn.s32 d9, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q11, #14
+ vrshrn.s32 d11, q5, #14
+ vrshrn.s32 d31, q13, #14
+ vrshrn.s32 d10, q10, #14
+ vrshrn.s32 d30, q12, #14
+
+ mbutterfly_l q8, q9, d29, d8, d1[0], d1[1]
+ mbutterfly_l q13, q10, d31, d9, d1[0], d1[1]
+ vrshrn.s32 d23, q8, #14
+ vrshrn.s32 d24, q9, #14
+ vneg.s32 q10, q10
+ vrshrn.s32 d27, q13, #14
+ vrshrn.s32 d20, q10, #14
+ mbutterfly_l q8, q9, d30, d10, d1[2], d1[3]
+ vrshrn.s32 d21, q8, #14
+ vrshrn.s32 d26, q9, #14
+ mbutterfly_l q8, q9, d28, d11, d1[2], d1[3]
+ vrshrn.s32 d25, q8, #14
+ vneg.s32 q9, q9
+ vrshrn.s32 d22, q9, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+@ We don't have the register space for a single-pass IDCT of 4x32,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs:
+@ a normal IDCT16 with every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
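+@ Schematically, with e[] = the idct16 of the even inputs and o[] = the
+@ odd-input transform (an illustrative view, not the exact register flow):
+@   out[i]      = e[i] + o[i]
+@   out[31 - i] = e[i] - o[i],    i = 0..15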
+function idct32_1d_4x32_pass1\suffix\()_neon
+ push {lr}
+
+ @ idct16 uses q2-q3 as scratch (it leaves q4-q7 alone so that the plain
+ @ 16x16 idct doesn't have to save them), so move the idct32_odd coeffs
+ @ out of the way to q4-q5
+ vmov q4, q2
+ vmov q5, q3
+
+ @ Double stride of the input, since we only read every other line
+ mov r12, #128
+ vmov.s16 d4, #0
+
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct16\suffix
+
+ @ Move the idct32_odd coeffs back into q2-q3 for idct32_odd;
+ @ the constants for a vmul with a lane must be in q0-q3.
+ vmov q2, q4
+ vmov q3, q5
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ @ Store the registers a, b, c, d horizontally, followed
+ @ by the same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+ vst1.16 {d\i}, [r0,:64]!
+ vrev64.16 d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+.endm
+ store_rev 16, 20, 24, 28
+ store_rev 17, 21, 25, 29
+ store_rev 18, 22, 26, 30
+ store_rev 19, 23, 27, 31
+ sub r0, r0, #256
+.purgem store_rev
+
+ @ Move r2 back to the start of the input, and move
+ @ to the first odd row
+.ifb \suffix
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+ sub r2, r2, r12, lsl #3
+.endif
+ add r2, r2, #64
+
+ vmov.s16 d8, #0
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+ @ Store the registers a, b, c, d horizontally,
+ @ adding into the output first, and then mirrored, subtracted
+ @ from the output.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+ vld1.16 {d8}, [r0,:64]
+ vadd.s16 d8, d8, d\i
+ vst1.16 {d8}, [r0,:64]!
+ vrev64.16 d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+ vld1.16 {d8}, [r0,:64]
+ vsub.s16 d8, d8, d\i
+ vst1.16 {d8}, [r0,:64]!
+.endr
+.endm
+
+ store_rev 31, 27, 23, 19
+ store_rev 30, 26, 22, 18
+ store_rev 29, 25, 21, 17
+ store_rev 28, 24, 20, 16
+.purgem store_rev
+ pop {pc}
+endfunc
+.ltorg
+
+@ This is mostly the same as 4x32_pass1, but without the transpose;
+@ it uses the source as a temp buffer between the two idct passes,
+@ and adds the result into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
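+@ The even idct16 result is parked back into the temp buffer, the odd
+@ transform is run, and load_acc_store then combines the two halves and
+@ adds the rounded result, (x + 32) >> 6, into the destination pixels.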
+function idct32_1d_4x32_pass2\suffix\()_neon
+ push {lr}
+ vmov q4, q2
+ vmov q5, q3
+
+ mov r12, #128
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+
+ bl idct16\suffix
+
+ vmov q2, q4
+ vmov q3, q5
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vst1.16 {d\i}, [r2,:64], r12
+.endr
+
+ sub r2, r2, r12, lsl #4
+ add r2, r2, #64
+
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+ sub r2, r2, #64
+
+ bl idct32_odd\suffix
+
+ mov r12, #128
+.macro load_acc_store a, b, c, d, neg=0
+ vld1.16 {d8}, [r2,:64], r12
+ vld1.16 {d9}, [r2,:64], r12
+.if \neg == 0
+ vadd.s16 d8, d8, d\a
+ vld1.16 {d10}, [r2,:64], r12
+ vadd.s16 d9, d9, d\b
+ vld1.16 {d11}, [r2,:64], r12
+ vadd.s16 d10, d10, d\c
+ vadd.s16 d11, d11, d\d
+.else
+ vsub.s16 d8, d8, d\a
+ vld1.16 {d10}, [r2,:64], r12
+ vsub.s16 d9, d9, d\b
+ vld1.16 {d11}, [r2,:64], r12
+ vsub.s16 d10, d10, d\c
+ vsub.s16 d11, d11, d\d
+.endif
+ vld1.32 {d12[]}, [r0,:32], r1
+ vld1.32 {d12[1]}, [r0,:32], r1
+ vrshr.s16 q4, q4, #6
+ vld1.32 {d13[]}, [r0,:32], r1
+ vrshr.s16 q5, q5, #6
+ vld1.32 {d13[1]}, [r0,:32], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q4, q4, d12
+ vaddw.u8 q5, q5, d13
+ vqmovun.s16 d8, q4
+ vqmovun.s16 d9, q5
+ vst1.32 {d8[0]}, [r0,:32], r1
+ vst1.32 {d8[1]}, [r0,:32], r1
+ vst1.32 {d9[0]}, [r0,:32], r1
+ vst1.32 {d9[1]}, [r0,:32], r1
+.endm
+ load_acc_store 31, 30, 29, 28
+ load_acc_store 27, 26, 25, 24
+ load_acc_store 23, 22, 21, 20
+ load_acc_store 19, 18, 17, 16
+ sub r2, r2, r12
+ neg r12, r12
+ load_acc_store 16, 17, 18, 19, 1
+ load_acc_store 20, 21, 22, 23, 1
+ load_acc_store 24, 25, 26, 27, 1
+ load_acc_store 28, 29, 30, 31, 1
+.purgem load_acc_store
+ pop {pc}
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+ cmp r3, #1
+ beq idct32x32_dc_add_neon
+ push {r4-r8,lr}
+ vpush {q4-q6}
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #2048
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]!
+ vld1.16 {q2-q3}, [r12,:128]
+
+ cmp r3, #34
+ ble idct32x32_quarter_add_neon
+ cmp r3, #135
+ ble idct32x32_half_add_neon
+
+ movrel r8, min_eob_idct_idct_32 + 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, sp, #(\i*64)
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(32 - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q6}
+ pop {r4-r8,pc}
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
+.irp i, 0, 4
+ add r0, sp, #(\i*64)
+.ifc \size,quarter
+.if \i == 4
+ cmp r3, #9
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+ add r0, sp, #(\i*64)
+.if \i == 12
+ cmp r3, #70
+ ble 1f
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+.rept 8
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q6}
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/libavcodec/arm/vp9lpf_16bpp_neon.S b/libavcodec/arm/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000000..7d2571dcc0
--- /dev/null
+++ b/libavcodec/arm/vp9lpf_16bpp_neon.S
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vswp \r1, \r8 @ vtrn.64 \rq0, \rq4
+ vswp \r3, \r10 @ vtrn.64 \rq1, \rq5
+ vswp \r5, \r12 @ vtrn.64 \rq2, \rq6
+ vswp \r7, \r14 @ vtrn.64 \rq3, \rq7
+ vtrn.32 \rq0, \rq2
+ vtrn.32 \rq1, \rq3
+ vtrn.32 \rq4, \rq6
+ vtrn.32 \rq5, \rq7
+ vtrn.16 \rq0, \rq1
+ vtrn.16 \rq2, \rq3
+ vtrn.16 \rq4, \rq5
+ vtrn.16 \rq6, \rq7
+.endm
+
+.macro transpose16_4x4 r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+@ Do a 4x4 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1
+.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
+ vtrn.32 \rq0, \rq1
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
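+@ The filter-enable mask computed below mirrors the scalar condition
+@ (a rough reference, not the exact instruction order):
+@   fm = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
+@            abs(q1-q0), abs(q2-q1), abs(q3-q2)) <= I
+@        && abs(p0-q0) * 2 + (abs(p1-q1) >> 1) <= E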
+.macro loop_filter_q wd
+ vdup.u16 q0, r2 @ E
+ vdup.u16 q1, r3 @ I
+
+ vabd.u16 q2, q8, q9 @ abs(p3 - p2)
+ vabd.u16 q3, q9, q10 @ abs(p2 - p1)
+ vabd.u16 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u16 q5, q12, q13 @ abs(q0 - q1)
+ vabd.u16 q6, q13, q14 @ abs(q1 - q2)
+ vabd.u16 q7, q14, q15 @ abs(q2 - q3)
+ vmax.u16 q2, q2, q3
+ vmax.u16 q3, q4, q5
+ vmax.u16 q4, q6, q7
+ vabd.u16 q5, q11, q12 @ abs(p0 - q0)
+ vmax.u16 q2, q2, q3
+ vadd.u16 q5, q5, q5 @ abs(p0 - q0) * 2
+ vabd.u16 q6, q10, q13 @ abs(p1 - q1)
+ vmax.u16 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u16 q6, q6, #1
+ vcle.u16 q2, q2, q1 @ max(abs()) <= I
+ vadd.u16 q5, q5, q6 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vcle.u16 q5, q5, q0
+ vand q2, q2, q5 @ fm
+
+ vmovn.u16 d10, q2
+ vmov r8, r9, d10
+ orrs r8, r8, r9
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vdup.u16 q0, r5
+
+ vabd.u16 q1, q8, q11 @ abs(p3 - p0)
+ vabd.u16 q3, q9, q11 @ abs(p2 - p0)
+ vabd.u16 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u16 q5, q13, q12 @ abs(q1 - q0)
+ vabd.u16 q6, q14, q12 @ abs(q2 - q0)
+ vabd.u16 q7, q15, q12 @ abs(q3 - q0)
+ vmax.u16 q1, q1, q3
+ vmax.u16 q4, q4, q5
+ vmax.u16 q6, q6, q7
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u16 q3, q10, q11 @ abs(p1 - p0)
+.if \wd == 8
+ vmax.u16 q1, q1, q4
+.endif
+ vabd.u16 q4, q13, q12 @ abs(q1 - q0)
+.if \wd == 8
+ vmax.u16 q1, q1, q6
+.endif
+
+ vsub.u16 q5, q10, q13 @ p1 - q1
+ vmax.u16 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
+ vdup.u16 q4, r4 @ H
+ vsub.u16 q6, q12, q11 @ q0 - p0
+.if \wd == 8
+ vcle.u16 q1, q1, q0 @ flat8in
+.endif
+ vdup.u16 q0, r6 @ left shift for saturation
+ vcle.u16 q3, q3, q4 @ !hev
+.if \wd == 8
+ vand q1, q1, q2 @ flat8in && fm
+.endif
+ vneg.s16 q4, q0 @ negative left shift after saturation
+ vqshl.s16 q5, q5, q0
+.if \wd == 8
+ vbic q2, q2, q1 @ fm && !flat8in
+.endif
+ vmov.s16 q7, #3
+ vand q3, q3, q2 @ !hev && fm && !flat8in
+ vshl.s16 q5, q5, q4 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ vmul.s16 q6, q6, q7 @ 3 * (q0 - p0)
+ vbic q5, q5, q3 @ if (!hev) av_clip_int2p = 0
+ vadd.s16 q6, q6, q5 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+ vmov.s16 q5, #4
+ vqshl.s16 q6, q6, q0
+ vmov.s16 q0, #3
+ vshl.s16 q6, q6, q4 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ vdup.u16 q4, r7 @ max pixel value
+
+ vshr.u16 q4, q4, #1 @ (1 << (BIT_DEPTH - 1)) - 1
+
+ vadd.s16 q5, q6, q5 @ f + 4
+ vadd.s16 q0, q6, q0 @ f + 3
+ vmov.s16 q6, #0
+ vmin.s16 q5, q5, q4 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ vmin.s16 q0, q0, q4 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ vdup.u16 q4, r7 @ max pixel value
+ vshr.s16 q5, q5, #3 @ f1
+ vshr.s16 q0, q0, #3 @ f2
+
+ vadd.s16 q0, q11, q0 @ p0 + f2
+ vsub.s16 q7, q12, q5 @ q0 - f1
+ vmin.s16 q0, q0, q4
+ vmin.s16 q7, q7, q4
+ vrshr.s16 q5, q5, #1 @ f = (f1 + 1) >> 1
+ vmax.s16 q0, q0, q6 @ out p0
+ vmax.s16 q7, q7, q6 @ out q0
+ vbit q11, q0, q2 @ if (fm && !flat8in)
+ vbit q12, q7, q2
+.if \wd >= 8
+ vmovn.u16 d4, q1
+.endif
+
+ vadd.s16 q0, q10, q5 @ p1 + f
+ vsub.s16 q7, q13, q5 @ q1 - f
+.if \wd >= 8
+ vmov r8, r9, d4
+.endif
+ vmin.s16 q0, q0, q4
+ vmin.s16 q7, q7, q4
+.if \wd >= 8
+ orrs r8, r8, r9
+.endif
+ vmax.s16 q0, q0, q6 @ out p1
+ vmax.s16 q7, q7, q6 @ out q1
+ vbit q10, q0, q3 @ if (!hev && fm && !flat8in)
+ vbit q13, q7, q3
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
+ beq 6f
+
+ @ flat8in
+ vadd.u16 q2, q8, q9
+ vadd.u16 q3, q10, q13
+ vadd.u16 q4, q8, q10
+ vadd.u16 q5, q11, q14
+ vadd.u16 q0, q2, q2
+ vadd.u16 q0, q0, q11
+ vadd.u16 q0, q0, q12
+ vadd.u16 q0, q0, q4
+ vsub.s16 q3, q3, q2
+ vsub.s16 q5, q5, q4
+ vrshr.u16 q6, q0, #3 @ out p2
+
+ vadd.u16 q0, q0, q3
+ vadd.u16 q2, q8, q11
+ vadd.u16 q3, q12, q15
+ vrshr.u16 q7, q0, #3 @ out p1
+
+ vadd.u16 q0, q0, q5
+ vsub.s16 q3, q3, q2
+ vadd.u16 q4, q9, q12
+ vbit q9, q6, q1
+ vadd.u16 q5, q13, q15
+ vrshr.u16 q6, q0, #3 @ out p0
+
+ vadd.u16 q0, q0, q3
+ vsub.s16 q5, q5, q4
+ vadd.u16 q2, q10, q13
+ vbit q10, q7, q1
+ vadd.u16 q3, q14, q15
+ vrshr.u16 q7, q0, #3 @ out q0
+
+ vadd.u16 q0, q0, q5
+ vsub.s16 q3, q3, q2
+ vbit q11, q6, q1
+ vrshr.u16 q6, q0, #3 @ out q1
+
+ vadd.u16 q0, q0, q3
+ vbit q12, q7, q1
+ vrshr.u16 q7, q0, #3 @ out q2
+ vbit q13, q6, q1
+ vbit q14, q7, q1
+.endif
+.endm
+
+@ The input to and output from this macro is in the registers d16-d31,
+@ and d0-d7 are used as scratch registers.
+@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
+@ Depending on the width of the loop filter, we either use d16-d19
+@ and d28-d31 as temp registers, or d8-d15.
+@ In practice, this is only ever instantiated once, so the macro parameters
+@ could be hardcoded, but they are kept as parameters to stay close to the
+@ 8 bpp and aarch64 versions.
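+@ Three nested masks select the filter strength per pixel:
+@   fm                        -> normal filter; updates p0/q0 (and p1/q1 if !hev)
+@   fm && flat8in             -> 7-tap filter;  updates p2 .. q2
+@   fm && flat8in && flat8out -> 15-tap filter (wd == 16); updates p6 .. q6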
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+ vdup.u16 d0, r2 @ E
+ vdup.u16 d2, r3 @ I
+
+ vabd.u16 d4, d20, d21 @ abs(p3 - p2)
+ vabd.u16 d5, d21, d22 @ abs(p2 - p1)
+ vabd.u16 d6, d22, d23 @ abs(p1 - p0)
+ vabd.u16 d7, d24, d25 @ abs(q0 - q1)
+ vabd.u16 \tmp1, d25, d26 @ abs(q1 - q2)
+ vabd.u16 \tmp2, d26, d27 @ abs(q2 - q3)
+ vmax.u16 d4, d4, d5
+ vmax.u16 d5, d6, d7
+ vmax.u16 \tmp1, \tmp1, \tmp2
+ vabd.u16 d6, d23, d24 @ abs(p0 - q0)
+ vmax.u16 d4, d4, d5
+ vadd.u16 d6, d6, d6 @ abs(p0 - q0) * 2
+ vabd.u16 d5, d22, d25 @ abs(p1 - q1)
+ vmax.u16 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u16 d5, d5, #1
+ vcle.u16 d4, d4, d2 @ max(abs()) <= I
+ vadd.u16 d6, d6, d5 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vcle.u16 d6, d6, d0
+ vand d4, d4, d6 @ fm
+
+ vdup.u16 d3, r4 @ H
+ vmov r8, r9, d4
+ orrs r8, r8, r9
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vdup.u16 d0, r5
+
+ vabd.u16 d6, d20, d23 @ abs(p3 - p0)
+ vabd.u16 d2, d21, d23 @ abs(p2 - p0)
+ vabd.u16 d1, d22, d23 @ abs(p1 - p0)
+ vabd.u16 \tmp1, d25, d24 @ abs(q1 - q0)
+ vabd.u16 \tmp2, d26, d24 @ abs(q2 - q0)
+ vabd.u16 \tmp3, d27, d24 @ abs(q3 - q0)
+ vmax.u16 d6, d6, d2
+ vmax.u16 d1, d1, \tmp1
+ vmax.u16 \tmp2, \tmp2, \tmp3
+.if \wd == 16
+ vabd.u16 d7, d16, d23 @ abs(p7 - p0)
+ vmax.u16 d6, d6, d1
+ vabd.u16 d2, d17, d23 @ abs(p6 - p0)
+ vmax.u16 d6, d6, \tmp2
+ vabd.u16 d1, d18, d23 @ abs(p5 - p0)
+ vcle.u16 d6, d6, d0 @ flat8in
+ vabd.u16 d8, d19, d23 @ abs(p4 - p0)
+ vand d6, d6, d4 @ flat8in && fm
+ vabd.u16 d9, d28, d24 @ abs(q4 - q0)
+ vbic d4, d4, d6 @ fm && !flat8in
+ vabd.u16 d10, d29, d24 @ abs(q5 - q0)
+ vabd.u16 d11, d30, d24 @ abs(q6 - q0)
+ vabd.u16 d12, d31, d24 @ abs(q7 - q0)
+
+ vmax.u16 d7, d7, d2
+ vmax.u16 d1, d1, d8
+ vmax.u16 d9, d9, d10
+ vmax.u16 d11, d11, d12
+ @ The rest of the calculation of flat8out is interleaved below
+.else
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u16 d5, d22, d23 @ abs(p1 - p0)
+.if \wd == 16
+ vmax.u16 d7, d7, d1
+ vmax.u16 d9, d9, d11
+.elseif \wd == 8
+ vmax.u16 d6, d6, d1
+.endif
+ vabd.u16 d1, d25, d24 @ abs(q1 - q0)
+.if \wd == 16
+ vmax.u16 d7, d7, d9
+.elseif \wd == 8
+ vmax.u16 d6, d6, \tmp2
+.endif
+ vdup.u16 \tmp2, r6 @ left shift for saturation
+ vsub.u16 \tmp1, d22, d25 @ p1 - q1
+ vneg.s16 \tmp6, \tmp2 @ negative left shift after saturation
+ vmax.u16 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
+ vsub.u16 \tmp3, d24, d23 @ q0 - p0
+ vmov.s16 \tmp5, #3
+.if \wd == 8
+ vcle.u16 d6, d6, d0 @ flat8in
+.endif
+ vcle.u16 d5, d5, d3 @ !hev
+.if \wd == 8
+ vand d6, d6, d4 @ flat8in && fm
+.endif
+ vqshl.s16 \tmp1, \tmp1, \tmp2
+.if \wd == 16
+ vcle.u16 d7, d7, d0 @ flat8out
+.elseif \wd == 8
+ vbic d4, d4, d6 @ fm && !flat8in
+.endif
+ vand d5, d5, d4 @ !hev && fm && !flat8in
+.if \wd == 16
+ vand d7, d7, d6 @ flat8out && flat8in && fm
+.endif
+ vshl.s16 \tmp1, \tmp1, \tmp6 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ vmul.s16 \tmp3, \tmp3, \tmp5 @ 3 * (q0 - p0)
+ vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int2p = 0
+ vmov.s16 d2, #4
+ vadd.s16 \tmp3, \tmp3, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+ vmov.s16 d3, #3
+ vqshl.s16 \tmp1, \tmp3, \tmp2
+ vmov.s16 \tmp5, #0
+ vshl.s16 \tmp1, \tmp1, \tmp6 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ vdup.u16 \tmp6, r7 @ max pixel value
+.if \wd == 16
+ vbic d6, d6, d7 @ fm && flat8in && !flat8out
+.endif
+
+ vshr.u16 \tmp2, \tmp6, #1 @ (1 << (BIT_DEPTH - 1)) - 1
+
+ vadd.s16 \tmp3, \tmp1, d2 @ f + 4
+ vadd.s16 \tmp4, \tmp1, d3 @ f + 3
+ vmin.s16 \tmp3, \tmp3, \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ vmin.s16 \tmp4, \tmp4, \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ vshr.s16 \tmp3, \tmp3, #3 @ f1
+ vshr.s16 \tmp4, \tmp4, #3 @ f2
+
+ vadd.s16 d0, d23, \tmp4 @ p0 + f2
+ vsub.s16 d2, d24, \tmp3 @ q0 - f1
+ vmin.s16 d0, d0, \tmp6
+ vmin.s16 d2, d2, \tmp6
+ vrshr.s16 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
+ vmax.s16 d0, d0, \tmp5 @ out p0
+ vmax.s16 d2, d2, \tmp5 @ out q0
+ vbit d23, d0, d4 @ if (fm && !flat8in)
+ vbit d24, d2, d4
+
+ vadd.s16 d0, d22, \tmp3 @ p1 + f
+ vsub.s16 d2, d25, \tmp3 @ q1 - f
+.if \wd >= 8
+ vmov r8, r9, d6
+.endif
+ vmin.s16 d0, d0, \tmp6
+ vmin.s16 d2, d2, \tmp6
+.if \wd >= 8
+ orrs r8, r8, r9
+.endif
+ vmax.s16 d0, d0, \tmp5 @ out p1
+ vmax.s16 d2, d2, \tmp5 @ out q1
+ vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
+ vbit d25, d2, d5
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to flat8out
+ @ (or to a writeout of the inner 4 pixels, for wd=8)
+ beq 6f
+
+ @ flat8in
+ vadd.u16 \tmp1, d20, d21
+ vadd.u16 \tmp3, d22, d25
+ vadd.u16 \tmp5, d20, d22
+ vadd.u16 \tmp7, d23, d26
+ vadd.u16 d0, \tmp1, \tmp1
+ vadd.u16 d0, d0, d23
+ vadd.u16 d0, d0, d24
+ vadd.u16 d0, d0, \tmp5
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vsub.s16 \tmp7, \tmp7, \tmp5
+ vrshr.u16 d2, d0, #3 @ out p2
+
+ vadd.u16 d0, d0, \tmp3
+ vadd.u16 \tmp1, d20, d23
+ vadd.u16 \tmp3, d24, d27
+ vrshr.u16 d3, d0, #3 @ out p1
+
+ vadd.u16 d0, d0, \tmp7
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vadd.u16 \tmp5, d21, d24
+ vadd.u16 \tmp7, d25, d27
+ vrshr.u16 d4, d0, #3 @ out p0
+
+ vadd.u16 d0, d0, \tmp3
+ vsub.s16 \tmp7, \tmp7, \tmp5
+ vadd.u16 \tmp1, d22, d25
+ vadd.u16 \tmp3, d26, d27
+ vrshr.u16 d5, d0, #3 @ out q0
+
+ vadd.u16 d0, d0, \tmp7
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vrshr.u16 \tmp5, d0, #3 @ out q1
+
+ vadd.u16 d0, d0, \tmp3
+ @ The output here is written back into the input registers. This doesn't
+ @ matter for the flat8out part below, since we only update those pixels
+ @ which won't be touched below.
+ vbit d21, d2, d6
+ vbit d22, d3, d6
+ vbit d23, d4, d6
+ vrshr.u16 \tmp6, d0, #3 @ out q2
+ vbit d24, d5, d6
+ vbit d25, \tmp5, d6
+ vbit d26, \tmp6, d6
+.endif
+.if \wd == 16
+6:
+ vorr d2, d6, d7
+ vmov r8, r9, d2
+ orrs r8, r8, r9
+ @ If no pixels needed flat8in nor flat8out, jump to a
+ @ writeout of the inner 4 pixels
+ beq 7f
+ vmov r8, r9, d7
+ orrs r8, r8, r9
+ @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ beq 8f
+
+ @ flat8out
+ @ This writes all outputs into d2-d17 (skipping d7 and d16).
+ @ If this part is skipped, the output is read from d21-d26 (which is the input
+ @ to this section).
+ vshl.u16 d0, d16, #3 @ 8 * d16
+ vsub.u16 d0, d0, d16 @ 7 * d16
+ vadd.u16 d0, d0, d17
+ vadd.u16 d8, d17, d18
+ vadd.u16 d10, d19, d20
+ vadd.s16 d0, d0, d8
+ vadd.u16 d8, d16, d17
+ vadd.u16 d12, d21, d22
+ vadd.s16 d0, d0, d10
+ vadd.u16 d10, d18, d25
+ vadd.u16 d14, d23, d24
+ vsub.s16 d10, d10, d8
+ vadd.s16 d0, d0, d12
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d18
+ vadd.u16 d14, d19, d26
+ vrshr.u16 d2, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d8, d16, d19
+ vadd.u16 d10, d20, d27
+ vsub.s16 d14, d14, d12
+ vbif d2, d17, d7
+ vrshr.u16 d3, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d20
+ vadd.u16 d14, d21, d28
+ vsub.s16 d10, d10, d8
+ vbif d3, d18, d7
+ vrshr.u16 d4, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d8, d16, d21
+ vadd.u16 d10, d22, d29
+ vsub.s16 d14, d14, d12
+ vbif d4, d19, d7
+ vrshr.u16 d5, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d22
+ vadd.u16 d14, d23, d30
+ vsub.s16 d10, d10, d8
+ vbif d5, d20, d7
+ vrshr.u16 d6, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d10, d16, d23
+ vsub.s16 d14, d14, d12
+ vadd.u16 d12, d24, d31
+ vbif d6, d21, d7
+ vrshr.u16 d8, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vsub.s16 d10, d12, d10
+ vadd.u16 d12, d17, d24
+ vadd.u16 d14, d25, d31
+ vbif d8, d22, d7
+ vrshr.u16 d9, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vsub.s16 d14, d14, d12
+ vadd.u16 d12, d26, d31
+ vbif d9, d23, d7
+ vrshr.u16 d10, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d14, d18, d25
+ vadd.u16 d18, d19, d26
+ vsub.s16 d12, d12, d14
+ vadd.u16 d14, d27, d31
+ vbif d10, d24, d7
+ vrshr.u16 d11, d0, #4
+
+ vadd.s16 d0, d0, d12
+ vadd.u16 d12, d20, d27
+ vsub.s16 d14, d14, d18
+ vadd.u16 d18, d28, d31
+ vbif d11, d25, d7
+ vsub.s16 d18, d18, d12
+ vrshr.u16 d12, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d14, d21, d28
+ vadd.u16 d20, d29, d31
+ vbif d12, d26, d7
+ vrshr.u16 d13, d0, #4
+
+ vadd.s16 d0, d0, d18
+ vsub.s16 d20, d20, d14
+ vadd.u16 d18, d22, d29
+ vadd.u16 d22, d30, d31
+ vbif d13, d27, d7
+ vrshr.u16 d14, d0, #4
+
+ vadd.s16 d0, d0, d20
+ vsub.s16 d22, d22, d18
+ vbif d14, d28, d7
+ vrshr.u16 d15, d0, #4
+
+ vadd.s16 d0, d0, d22
+ vbif d15, d29, d7
+ vrshr.u16 d17, d0, #4
+ vbif d17, d30, d7
+.endif
+.endm
+
+.macro loop_filter_q_4
+ loop_filter_q 4
+.endm
+
+.macro loop_filter_q_8
+ loop_filter_q 8
+.endm
+
+.macro loop_filter_16
+ loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15
+.endm
+
+
+@ The public functions in this file have the following signature:
+@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
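+@ mb_lim (E), lim (I) and hev_thr (H) arrive in 8 bpp units; the 10/12 bpp
+@ frontends below shift them up by (bpp - 8), and additionally pass the flat
+@ threshold (1 << (bpp - 8)), the saturation shift (16 - bpp) and the max
+@ pixel value ((1 << bpp) - 1) in r5, r6 and r7.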
+
+.macro bpp_frontend func, bpp
+function ff_\func\()_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl \func\()_16_neon
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends func
+ bpp_frontend \func, 10
+ bpp_frontend \func, 12
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+ add r0, r0, r1, lsl #2
+.else
+ add r0, r0, #8
+.endif
+ bl \func\()_\int_suffix\()_16_neon
+.if \rep >= 4
+.ifc \dir,h
+ add r0, r0, r1, lsl #2
+ bl \func\()_\int_suffix\()_16_neon
+ add r0, r0, r1, lsl #2
+ bl \func\()_\int_suffix\()_16_neon
+.else
+ add r0, r0, #8
+ bl \func\()_\int_suffix\()_16_neon
+ add r0, r0, #8
+ bl \func\()_\int_suffix\()_16_neon
+.endif
+.endif
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
+ bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
+ bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
+.endm
+
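+@ The mix2 wrappers filter two adjacent 8-pixel edges with (possibly)
+@ different filter widths; mb_lim, lim and hev_thr carry the parameters for
+@ the two halves packed into bits 0-7 and 8-15, which is why they are
+@ unpacked with and/lsr around the two calls below.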
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ push {r2, r3, r4}
+ and r2, r2, #0xff
+ and r3, r3, #0xff
+ and r4, r4, #0xff
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+ add r0, r0, r1, lsl #3
+.else
+ add r0, r0, #16
+.endif
+ pop {r2, r3, r4}
+ lsr r2, r2, #8
+ lsr r3, r3, #8
+ lsr r4, r4, #8
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+ bpp_frontend_mix2 \wd1, \wd2, v, 10
+ bpp_frontend_mix2 \wd1, \wd2, v, 12
+ bpp_frontend_mix2 \wd1, \wd2, h, 10
+ bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+ sub r12, r0, r1, lsl #2
+ vld1.16 {q8}, [r12,:128], r1 @ p3
+ vld1.16 {q12}, [r0, :128], r1 @ q0
+ vld1.16 {q9}, [r12,:128], r1 @ p2
+ vld1.16 {q13}, [r0, :128], r1 @ q1
+ vld1.16 {q10}, [r12,:128], r1 @ p1
+ vld1.16 {q14}, [r0, :128], r1 @ q2
+ vld1.16 {q11}, [r12,:128], r1 @ p0
+ vld1.16 {q15}, [r0, :128], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_q_4
+
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+9:
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+
+function vp9_loop_filter_h_4_8_16_neon
+ sub r12, r0, #8
+ add r0, r12, r1, lsl #2
+ vld1.16 {q8}, [r12,:64], r1
+ vld1.16 {q12}, [r0, :64], r1
+ vld1.16 {q9}, [r12,:64], r1
+ vld1.16 {q13}, [r0, :64], r1
+ vld1.16 {q10}, [r12,:64], r1
+ vld1.16 {q14}, [r0, :64], r1
+ vld1.16 {q11}, [r12,:64], r1
+ vld1.16 {q15}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #4
+ add r0, r0, #4
+
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ loop_filter_q_4
+
+ @ We only will write the mid 4 pixels back; after the loop filter,
+ @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
+ @ We need to transpose them to columns, done with a
+ @ 4x4 transpose (which in practice is two 4x4 transposes of the two
+ @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
+ transpose16_4x4 q10, q11, q12, q13
+
+ vst1.16 {d20}, [r12], r1
+ vst1.16 {d21}, [r0], r1
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d23}, [r0], r1
+ vst1.16 {d24}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ vst1.16 {d26}, [r12], r1
+ vst1.16 {d27}, [r0], r1
+ sub r12, r12, r1, lsl #2
+9:
+ add r0, r12, #4
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+
+function vp9_loop_filter_v_8_8_16_neon
+ sub r12, r0, r1, lsl #2
+ vld1.16 {q8}, [r12,:128], r1 @ p3
+ vld1.16 {q12}, [r0, :128], r1 @ q0
+ vld1.16 {q9}, [r12,:128], r1 @ p2
+ vld1.16 {q13}, [r0, :128], r1 @ q1
+ vld1.16 {q10}, [r12,:128], r1 @ p1
+ vld1.16 {q14}, [r0, :128], r1 @ q2
+ vld1.16 {q11}, [r12,:128], r1 @ p0
+ vld1.16 {q15}, [r0, :128], r1 @ q3
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, r1
+
+ loop_filter_q_8
+
+ vst1.16 {q9}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q14}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+9:
+ bx lr
+6:
+ sub r12, r0, r1, lsl #1
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+
+function vp9_loop_filter_h_8_8_16_neon
+ sub r12, r0, #8
+ add r0, r12, r1, lsl #2
+ vld1.16 {q8}, [r12,:64], r1
+ vld1.16 {q12}, [r0, :64], r1
+ vld1.16 {q9}, [r12,:64], r1
+ vld1.16 {q13}, [r0, :64], r1
+ vld1.16 {q10}, [r12,:64], r1
+ vld1.16 {q14}, [r0, :64], r1
+ vld1.16 {q11}, [r12,:64], r1
+ vld1.16 {q15}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ loop_filter_q_8
+
+ @ Even though only 6 pixels per row have been changed, we write the
+ @ full 8 pixel registers.
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ vst1.16 {q8}, [r12,:64], r1
+ vst1.16 {q12}, [r0, :64], r1
+ vst1.16 {q9}, [r12,:64], r1
+ vst1.16 {q13}, [r0, :64], r1
+ vst1.16 {q10}, [r12,:64], r1
+ vst1.16 {q14}, [r0, :64], r1
+ vst1.16 {q11}, [r12,:64], r1
+ vst1.16 {q15}, [r0, :64], r1
+ sub r12, r12, r1, lsl #2
+9:
+ add r0, r12, #8
+ bx lr
+6:
+ @ If we didn't need to do the flat8in part, we use the same writeback
+ @ as in loop_filter_h_4_8.
+ add r12, r12, #4
+ add r0, r0, #4
+ transpose16_4x4 q10, q11, q12, q13
+
+ vst1.16 {d20}, [r12], r1
+ vst1.16 {d21}, [r0], r1
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d23}, [r0], r1
+ vst1.16 {d24}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ vst1.16 {d26}, [r12], r1
+ vst1.16 {d27}, [r0], r1
+ sub r12, r12, r1, lsl #2
+ add r0, r12, #4
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_4_16_neon
+ sub r12, r0, r1, lsl #3
+ @ Read p7-p0 using r12 and q0-q7 using r0
+ vld1.16 {d16}, [r12,:64], r1 @ p7
+ vld1.16 {d24}, [r0, :64], r1 @ q0
+ vld1.16 {d17}, [r12,:64], r1 @ p6
+ vld1.16 {d25}, [r0, :64], r1 @ q1
+ vld1.16 {d18}, [r12,:64], r1 @ p5
+ vld1.16 {d26}, [r0, :64], r1 @ q2
+ vld1.16 {d19}, [r12,:64], r1 @ p4
+ vld1.16 {d27}, [r0, :64], r1 @ q3
+ vld1.16 {d20}, [r12,:64], r1 @ p3
+ vld1.16 {d28}, [r0, :64], r1 @ q4
+ vld1.16 {d21}, [r12,:64], r1 @ p2
+ vld1.16 {d29}, [r0, :64], r1 @ q5
+ vld1.16 {d22}, [r12,:64], r1 @ p1
+ vld1.16 {d30}, [r0, :64], r1 @ q6
+ vld1.16 {d23}, [r12,:64], r1 @ p0
+ vld1.16 {d31}, [r0, :64], r1 @ q7
+ sub r12, r12, r1, lsl #3
+ sub r0, r0, r1, lsl #3
+ add r12, r12, r1
+
+ loop_filter_16
+
+ @ If we did the flat8out part, we get the output in
+ @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
+ @ store d2-d9 there, and d10-d17 into r0.
+ vst1.16 {d2}, [r12,:64], r1
+ vst1.16 {d10}, [r0, :64], r1
+ vst1.16 {d3}, [r12,:64], r1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d4}, [r12,:64], r1
+ vst1.16 {d12}, [r0, :64], r1
+ vst1.16 {d5}, [r12,:64], r1
+ vst1.16 {d13}, [r0, :64], r1
+ vst1.16 {d6}, [r12,:64], r1
+ vst1.16 {d14}, [r0, :64], r1
+ vst1.16 {d8}, [r12,:64], r1
+ vst1.16 {d15}, [r0, :64], r1
+ vst1.16 {d9}, [r12,:64], r1
+ vst1.16 {d17}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+9:
+ bx lr
+
+8:
+ add r12, r12, r1, lsl #2
+ @ If we didn't do the flat8out part, the output is left in the
+ @ input registers.
+ vst1.16 {d21}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx lr
+7:
+ sub r12, r0, r1, lsl #1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_v_16, 8, 4, 2, v
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v
+
+function vp9_loop_filter_h_16_4_16_neon
+ sub r12, r0, #16
+ sub r0, r0, #8
+ vld1.16 {d16}, [r12,:64], r1
+ vld1.16 {d20}, [r0, :64], r1
+ vld1.16 {d17}, [r12,:64], r1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r12,:64], r1
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r12,:64], r1
+ vld1.16 {d23}, [r0, :64], r1
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, #16
+ add r0, r0, #16
+ vld1.16 {d24}, [r12,:64], r1
+ vld1.16 {d28}, [r0, :64], r1
+ vld1.16 {d25}, [r12,:64], r1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r12,:64], r1
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r12,:64], r1
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #2
+ sub r12, r12, #16
+ sub r0, r0, #16
+
+        @ The 16x4 pixels read above are in four 4x4 blocks
+ transpose16_q_4x4 q8, q9, d16, d17, d18, d19
+ transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+ transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+ transpose16_q_4x4 q14, q15, d28, d29, d30, d31
+
+ loop_filter_16
+
+ @ Transpose back; this is the same transpose as above, but
+ @ we can't take advantage of q registers for the transpose, since
+        @ not all d registers in the transpose are consecutive.
+ transpose16_4x4 d16, d2, d3, d4
+ transpose16_4x4 d5, d6, d8, d9
+ transpose16_4x4 d10, d11, d12, d13
+ transpose16_4x4 d14, d15, d17, d31
+
+ vst1.16 {d16}, [r12,:64], r1
+ vst1.16 {d5}, [r0, :64], r1
+
+ vst1.16 {d2}, [r12,:64], r1
+ vst1.16 {d6}, [r0, :64], r1
+
+ vst1.16 {d3}, [r12,:64], r1
+ vst1.16 {d8}, [r0, :64], r1
+
+ vst1.16 {d4}, [r12,:64], r1
+ vst1.16 {d9}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, #16
+ add r0, r0, #16
+
+ vst1.16 {d10}, [r12,:64], r1
+ vst1.16 {d14}, [r0, :64], r1
+
+ vst1.16 {d11}, [r12,:64], r1
+ vst1.16 {d15}, [r0, :64], r1
+
+ vst1.16 {d12}, [r12,:64], r1
+ vst1.16 {d17}, [r0, :64], r1
+
+ vst1.16 {d13}, [r12,:64], r1
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, #8
+ bx lr
+9:
+ add r0, r0, #8
+ bx lr
+8:
+ add r12, r12, #8
+ add r0, r0, #8
+ transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+ transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+
+ vst1.16 {d20}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ bx lr
+7:
+ add r12, r12, #12
+ add r0, r12, r1, lsl #1
+ transpose16_q_4x4 q11, q12, d22, d23, d24, d25
+
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #4
+ bx lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_h_16, 8, 4, 2, h
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
new file mode 100644
index 0000000000..4b3608064a
--- /dev/null
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -0,0 +1,959 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ Do an 8x8 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \rq0, \rq2
+ vtrn.32 \rq1, \rq3
+ vtrn.16 \rq0, \rq1
+ vtrn.16 \rq2, \rq3
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+@ Do a 4x4 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1
+.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
+ vtrn.16 \rq0, \rq1
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
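+@ E, I and H are each passed as two packed per-8-pixel byte values
+@ (bits 7:0 for the first 8 pixels, bits 15:8 for the second 8).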
+.macro loop_filter_q
+ vdup.u8 d0, r2 @ E
+ lsr r2, r2, #8
+ vdup.u8 d2, r3 @ I
+ lsr r3, r3, #8
+ vdup.u8 d1, r2 @ E
+ vdup.u8 d3, r3 @ I
+
+ vabd.u8 q2, q8, q9 @ abs(p3 - p2)
+ vabd.u8 q3, q9, q10 @ abs(p2 - p1)
+ vabd.u8 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u8 q5, q12, q13 @ abs(q0 - q1)
+ vabd.u8 q6, q13, q14 @ abs(q1 - q2)
+ vabd.u8 q7, q14, q15 @ abs(q2 - q3)
+ vmax.u8 q2, q2, q3
+ vmax.u8 q3, q4, q5
+ vmax.u8 q4, q6, q7
+ vabd.u8 q5, q11, q12 @ abs(p0 - q0)
+ vmax.u8 q2, q2, q3
+ vqadd.u8 q5, q5, q5 @ abs(p0 - q0) * 2
+ vabd.u8 q7, q10, q13 @ abs(p1 - q1)
+ vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u8 q7, q7, #1
+ vcle.u8 q2, q2, q1 @ max(abs()) <= I
+ vqadd.u8 q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ vcle.u8 q5, q5, q0
+ vand q2, q2, q5 @ fm
+
+ vshrn.u16 d10, q2, #4
+ vmov r2, r3, d10
+ orrs r2, r2, r3
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ ldr r3, [sp, #64]
+ vabd.u8 q3, q10, q11 @ abs(p1 - p0)
+ vabd.u8 q4, q13, q12 @ abs(q1 - q0)
+
+ vsubl.u8 q5, d20, d26 @ p1 - q1
+ vsubl.u8 q6, d21, d27 @ p1 - q1
+ vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
+ vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1)
+ vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1)
+ vdup.u8 d8, r3 @ H
+ lsr r3, r3, #8
+ vdup.u8 d9, r3 @ H
+ vsubl.u8 q6, d24, d22 @ q0 - p0
+ vsubl.u8 q7, d25, d23 @ q0 - p0
+ vcle.u8 q3, q3, q4 @ hev
+ vmov.s16 q0, #3
+ vand q3, q3, q2 @ !hev && fm && !flat8in
+
+ vmul.s16 q6, q6, q0 @ 3 * (q0 - p0)
+ vmul.s16 q7, q7, q0 @ 3 * (q0 - p0)
+ vbic q5, q5, q3 @ if (!hev) av_clip_int8 = 0
+ vaddw.s8 q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vaddw.s8 q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vmov.s8 q5, #4
+ vqmovn.s16 d12, q6
+ vqmovn.s16 d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
+ vmov.s8 q0, #3
+
+ vqadd.s8 q5, q6, q5 @ FFMIN(f + 4, 127)
+ vqadd.s8 q0, q6, q0 @ FFMIN(f + 3, 127)
+ vmovl.u8 q6, d22 @ p0
+ vmovl.u8 q7, d23 @ p0
+ vshr.s8 q5, q5, #3 @ f1
+ vshr.s8 q0, q0, #3 @ f2
+
+ vaddw.s8 q6, q6, d0 @ p0 + f2
+ vaddw.s8 q7, q7, d1 @ p0 + f2
+ vqmovun.s16 d0, q6 @ out p0
+ vmovl.u8 q6, d24 @ q0
+ vqmovun.s16 d1, q7 @ out p0
+ vmovl.u8 q7, d25 @ q0
+ vsubw.s8 q6, q6, d10 @ q0 - f1
+ vsubw.s8 q7, q7, d11 @ q0 - f1
+ vqmovun.s16 d12, q6 @ out q0
+ vqmovun.s16 d13, q7 @ out q0
+ vrshr.s8 q5, q5, #1 @ f = (f1 + 1) >> 1
+ vbit q11, q0, q2 @ if (fm && !flat8in)
+ vbit q12, q6, q2
+
+ vmovl.u8 q0, d20 @ p1
+ vmovl.u8 q2, d21 @ p1
+ vmovl.u8 q6, d26 @ q1
+ vmovl.u8 q7, d27 @ q1
+ vaddw.s8 q0, q0, d10 @ p1 + f
+ vaddw.s8 q2, q2, d11 @ p1 + f
+ vsubw.s8 q6, q6, d10 @ q1 - f
+ vsubw.s8 q7, q7, d11 @ q1 - f
+ vqmovun.s16 d0, q0 @ out p1
+ vqmovun.s16 d1, q2 @ out p1
+ vqmovun.s16 d12, q6 @ out q1
+ vqmovun.s16 d13, q7 @ out q1
+ vbit q10, q0, q3 @ if (!hev && fm && !flat8in)
+ vbit q13, q6, q3
+.endm
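
For reference, the per-column arithmetic that the loop_filter_q macro above
(and the wd == 4 path of the loop_filter macro below) implements can be
sketched in scalar C roughly as follows. This is a hedged illustration for
8 bpp with hypothetical helper and function names, not FFmpeg API:

    #include <stdint.h>
    #include <stdlib.h>

    static int clip_int8(int v)  { return v < -128 ? -128 : v > 127 ? 127 : v; }
    static int clip_uint8(int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }

    /* One pixel column of the normal inner filter, guarded by the fm mask. */
    static void lpf_inner_scalar(uint8_t *p3, uint8_t *p2, uint8_t *p1, uint8_t *p0,
                                 uint8_t *q0, uint8_t *q1, uint8_t *q2, uint8_t *q3,
                                 int E, int I, int H)
    {
        int fm = abs(*p3 - *p2) <= I && abs(*p2 - *p1) <= I && abs(*p1 - *p0) <= I &&
                 abs(*q1 - *q0) <= I && abs(*q2 - *q1) <= I && abs(*q3 - *q2) <= I &&
                 abs(*p0 - *q0) * 2 + (abs(*p1 - *q1) >> 1) <= E;
        if (!fm)                           /* the "beq 9f" early exit */
            return;
        int hev = abs(*p1 - *p0) > H || abs(*q1 - *q0) > H;
        int f   = clip_int8(3 * (*q0 - *p0) + (hev ? clip_int8(*p1 - *q1) : 0));
        int f1  = clip_int8(f + 4) >> 3;
        int f2  = clip_int8(f + 3) >> 3;
        *p0 = clip_uint8(*p0 + f2);
        *q0 = clip_uint8(*q0 - f1);
        if (!hev) {                        /* the vrshr.s8 #1 + vbit path */
            int f3 = (f1 + 1) >> 1;
            *p1 = clip_uint8(*p1 + f3);
            *q1 = clip_uint8(*q1 - f3);
        }
    }

The NEON macros compute this for 8 or 16 columns at a time, keeping the fm and
hev conditions as byte masks so the filtered values can be merged with vbit.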
+
+@ The input to and output from this macro is in the registers d16-d31,
+@ and d0-d7 are used as scratch registers.
+@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
+@ Depending on the width of the loop filter, we either use d16-d19
+@ and d28-d31 as temp registers, or d8-d15.
+@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
+ vdup.u8 d0, r2 @ E
+ vdup.u8 d2, r3 @ I
+ ldr r3, [sp]
+
+ vabd.u8 d4, d20, d21 @ abs(p3 - p2)
+ vabd.u8 d5, d21, d22 @ abs(p2 - p1)
+ vabd.u8 d6, d22, d23 @ abs(p1 - p0)
+ vabd.u8 d7, d24, d25 @ abs(q0 - q1)
+ vabd.u8 \tmp1, d25, d26 @ abs(q1 - q2)
+ vabd.u8 \tmp2, d26, d27 @ abs(q2 - q3)
+ vmax.u8 d4, d4, d5
+ vmax.u8 d5, d6, d7
+ vmax.u8 \tmp1, \tmp1, \tmp2
+ vabd.u8 d6, d23, d24 @ abs(p0 - q0)
+ vmax.u8 d4, d4, d5
+ vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2
+ vabd.u8 d5, d22, d25 @ abs(p1 - q1)
+ vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u8 d5, d5, #1
+ vcle.u8 d4, d4, d2 @ max(abs()) <= I
+ vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ vcle.u8 d5, d6, d0
+ vand d4, d4, d5 @ fm
+
+ vdup.u8 d3, r3 @ H
+ vmov r2, r3, d4
+ orrs r2, r2, r3
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vmov.u8 d0, #1
+
+ vabd.u8 d6, d20, d23 @ abs(p3 - p0)
+ vabd.u8 d2, d21, d23 @ abs(p2 - p0)
+ vabd.u8 d1, d22, d23 @ abs(p1 - p0)
+ vabd.u8 \tmp1, d25, d24 @ abs(q1 - q0)
+ vabd.u8 \tmp2, d26, d24 @ abs(q2 - q0)
+ vabd.u8 \tmp3, d27, d24 @ abs(q3 - q0)
+ vmax.u8 d6, d6, d2
+ vmax.u8 d1, d1, \tmp1
+ vmax.u8 \tmp2, \tmp2, \tmp3
+.if \wd == 16
+ vabd.u8 d7, d16, d23 @ abs(p7 - p0)
+ vmax.u8 d6, d6, d1
+ vabd.u8 d2, d17, d23 @ abs(p6 - p0)
+ vmax.u8 d6, d6, \tmp2
+ vabd.u8 d1, d18, d23 @ abs(p5 - p0)
+ vcle.u8 d6, d6, d0 @ flat8in
+ vabd.u8 d8, d19, d23 @ abs(p4 - p0)
+ vand d6, d6, d4 @ flat8in && fm
+ vabd.u8 d9, d28, d24 @ abs(q4 - q0)
+ vbic d4, d4, d6 @ fm && !flat8in
+ vabd.u8 d10, d29, d24 @ abs(q5 - q0)
+ vabd.u8 d11, d30, d24 @ abs(q6 - q0)
+ vabd.u8 d12, d31, d24 @ abs(q7 - q0)
+
+ vmax.u8 d7, d7, d2
+ vmax.u8 d1, d1, d8
+ vmax.u8 d9, d9, d10
+ vmax.u8 d11, d11, d12
+ @ The rest of the calculation of flat8out is interleaved below
+.else
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u8 d5, d22, d23 @ abs(p1 - p0)
+.if \wd == 16
+ vmax.u8 d7, d7, d1
+ vmax.u8 d9, d9, d11
+.elseif \wd == 8
+ vmax.u8 d6, d6, d1
+.endif
+ vabd.u8 d1, d25, d24 @ abs(q1 - q0)
+.if \wd == 16
+ vmax.u8 d7, d7, d9
+.elseif \wd == 8
+ vmax.u8 d6, d6, \tmp2
+.endif
+ vsubl.u8 \tmpq1, d22, d25 @ p1 - q1
+ vmax.u8 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
+ vsubl.u8 \tmpq2, d24, d23 @ q0 - p0
+ vmov.s16 \tmpq3, #3
+.if \wd == 8
+ vcle.u8 d6, d6, d0 @ flat8in
+.endif
+ vcle.u8 d5, d5, d3 @ !hev
+.if \wd == 8
+ vand d6, d6, d4 @ flat8in && fm
+.endif
+ vqmovn.s16 \tmp1, \tmpq1 @ av_clip_int8(p1 - q1)
+.if \wd == 16
+ vcle.u8 d7, d7, d0 @ flat8out
+.elseif \wd == 8
+ vbic d4, d4, d6 @ fm && !flat8in
+.endif
+ vand d5, d5, d4 @ !hev && fm && !flat8in
+.if \wd == 16
+ vand d7, d7, d6 @ flat8out && flat8in && fm
+.endif
+
+ vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
+ vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
+ vmov.s8 d2, #4
+ vaddw.s8 \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vmov.s8 d3, #3
+ vqmovn.s16 \tmp1, \tmpq2 @ f
+.if \wd == 16
+ vbic d6, d6, d7 @ fm && flat8in && !flat8out
+.endif
+
+ vqadd.s8 \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
+ vqadd.s8 \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
+ vmovl.u8 q0, d23 @ p0
+ vshr.s8 \tmp3, \tmp3, #3 @ f1
+ vshr.s8 \tmp4, \tmp4, #3 @ f2
+
+ vmovl.u8 q1, d24 @ q0
+ vaddw.s8 q0, q0, \tmp4 @ p0 + f2
+ vsubw.s8 q1, q1, \tmp3 @ q0 - f1
+ vqmovun.s16 d0, q0 @ out p0
+ vqmovun.s16 d1, q1 @ out q0
+ vrshr.s8 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
+ vbit d23, d0, d4 @ if (fm && !flat8in)
+ vbit d24, d1, d4
+
+ vmovl.u8 q0, d22 @ p1
+ vmovl.u8 q1, d25 @ q1
+.if \wd >= 8
+ vmov r2, r3, d6
+.endif
+ vaddw.s8 q0, q0, \tmp3 @ p1 + f
+ vsubw.s8 q1, q1, \tmp3 @ q1 - f
+.if \wd >= 8
+ orrs r2, r2, r3
+.endif
+ vqmovun.s16 d0, q0 @ out p1
+ vqmovun.s16 d2, q1 @ out q1
+ vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
+ vbit d25, d2, d5
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to flat8out
+ @ (or to a writeout of the inner 4 pixels, for wd=8)
+ beq 6f
+
+ @ flat8in
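+        @ The first output is out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;
+        @ each following output updates the running sum in q0 by adding the
+        @ two incoming taps and subtracting the two outgoing ones.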
+ vaddl.u8 \tmpq1, d20, d21
+ vaddl.u8 \tmpq2, d22, d25
+ vaddl.u8 \tmpq3, d20, d22
+ vaddl.u8 \tmpq4, d23, d26
+ vadd.u16 q0, \tmpq1, \tmpq1
+ vaddw.u8 q0, q0, d23
+ vaddw.u8 q0, q0, d24
+ vadd.u16 q0, q0, \tmpq3
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vsub.s16 \tmpq4, \tmpq4, \tmpq3
+ vrshrn.u16 d2, q0, #3 @ out p2
+
+ vadd.u16 q0, q0, \tmpq2
+ vaddl.u8 \tmpq1, d20, d23
+ vaddl.u8 \tmpq2, d24, d27
+ vrshrn.u16 d3, q0, #3 @ out p1
+
+ vadd.u16 q0, q0, \tmpq4
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vaddl.u8 \tmpq3, d21, d24
+ vaddl.u8 \tmpq4, d25, d27
+ vrshrn.u16 d4, q0, #3 @ out p0
+
+ vadd.u16 q0, q0, \tmpq2
+ vsub.s16 \tmpq4, \tmpq4, \tmpq3
+ vaddl.u8 \tmpq1, d22, d25
+ vaddl.u8 \tmpq2, d26, d27
+ vrshrn.u16 d5, q0, #3 @ out q0
+
+ vadd.u16 q0, q0, \tmpq4
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vrshrn.u16 \tmp5, q0, #3 @ out q1
+
+ vadd.u16 q0, q0, \tmpq2
+ @ The output here is written back into the input registers. This doesn't
+ @ matter for the flat8out part below, since we only update those pixels
+ @ which won't be touched below.
+ vbit d21, d2, d6
+ vbit d22, d3, d6
+ vbit d23, d4, d6
+ vrshrn.u16 \tmp6, q0, #3 @ out q2
+ vbit d24, d5, d6
+ vbit d25, \tmp5, d6
+ vbit d26, \tmp6, d6
+.endif
+.if \wd == 16
+6:
+ vorr d2, d6, d7
+ vmov r2, r3, d2
+ orrs r2, r2, r3
+ @ If no pixels needed flat8in nor flat8out, jump to a
+ @ writeout of the inner 4 pixels
+ beq 7f
+ vmov r2, r3, d7
+ orrs r2, r2, r3
+ @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ beq 8f
+
+ @ flat8out
+ @ This writes all outputs into d2-d17 (skipping d6 and d16).
+ @ If this part is skipped, the output is read from d21-d26 (which is the input
+ @ to this section).
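+        @ The first output is out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 +
+        @ p0 + q0 + 8) >> 4; the rest update the running sum in q0 with the
+        @ same sliding window scheme as in flat8in above.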
+ vshll.u8 q0, d16, #3 @ 8 * d16
+ vsubw.u8 q0, q0, d16 @ 7 * d16
+ vaddw.u8 q0, q0, d17
+ vaddl.u8 q4, d17, d18
+ vaddl.u8 q5, d19, d20
+ vadd.s16 q0, q0, q4
+ vaddl.u8 q4, d16, d17
+ vaddl.u8 q6, d21, d22
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q5, d18, d25
+ vaddl.u8 q7, d23, d24
+ vsub.s16 q5, q5, q4
+ vadd.s16 q0, q0, q6
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d18
+ vaddl.u8 q7, d19, d26
+ vrshrn.u16 d2, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q4, d16, d19
+ vaddl.u8 q5, d20, d27
+ vsub.s16 q7, q7, q6
+ vbif d2, d17, d7
+ vrshrn.u16 d3, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d20
+ vaddl.u8 q7, d21, d28
+ vsub.s16 q5, q5, q4
+ vbif d3, d18, d7
+ vrshrn.u16 d4, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q4, d16, d21
+ vaddl.u8 q5, d22, d29
+ vsub.s16 q7, q7, q6
+ vbif d4, d19, d7
+ vrshrn.u16 d5, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d22
+ vaddl.u8 q7, d23, d30
+ vsub.s16 q5, q5, q4
+ vbif d5, d20, d7
+ vrshrn.u16 d6, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q5, d16, d23
+ vsub.s16 q7, q7, q6
+ vaddl.u8 q6, d24, d31
+ vbif d6, d21, d7
+ vrshrn.u16 d8, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vsub.s16 q5, q6, q5
+ vaddl.u8 q6, d17, d24
+ vaddl.u8 q7, d25, d31
+ vbif d8, d22, d7
+ vrshrn.u16 d9, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vsub.s16 q7, q7, q6
+ vaddl.u8 q6, d26, d31
+ vbif d9, d23, d7
+ vrshrn.u16 d10, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q7, d18, d25
+ vaddl.u8 q9, d19, d26
+ vsub.s16 q6, q6, q7
+ vaddl.u8 q7, d27, d31
+ vbif d10, d24, d7
+ vrshrn.u16 d11, q0, #4
+
+ vadd.s16 q0, q0, q6
+ vaddl.u8 q6, d20, d27
+ vsub.s16 q7, q7, q9
+ vaddl.u8 q9, d28, d31
+ vbif d11, d25, d7
+ vsub.s16 q9, q9, q6
+ vrshrn.u16 d12, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q7, d21, d28
+ vaddl.u8 q10, d29, d31
+ vbif d12, d26, d7
+ vrshrn.u16 d13, q0, #4
+
+ vadd.s16 q0, q0, q9
+ vsub.s16 q10, q10, q7
+ vaddl.u8 q9, d22, d29
+ vaddl.u8 q11, d30, d31
+ vbif d13, d27, d7
+ vrshrn.u16 d14, q0, #4
+
+ vadd.s16 q0, q0, q10
+ vsub.s16 q11, q11, q9
+ vbif d14, d28, d7
+ vrshrn.u16 d15, q0, #4
+
+ vadd.s16 q0, q0, q11
+ vbif d15, d29, d7
+ vrshrn.u16 d17, q0, #4
+ vbif d17, d30, d7
+.endif
+.endm
+
+@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers;
+@ for wd=16 those are needed as inputs/outputs, so we use d8-d15
+@ as temp registers there instead.
+.macro loop_filter_4
+ loop_filter 4, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
+.endm
+
+.macro loop_filter_8
+ loop_filter 8, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
+.endm
+
+.macro loop_filter_16
+ loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
+.endm
+
+
+@ The public functions in this file have the following signature:
+@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+function ff_vp9_loop_filter_v_4_8_neon, export=1
+ sub r12, r0, r1, lsl #2
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_4
+
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+9:
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_4_8_neon, export=1
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #2
+ add r0, r0, #2
+
+ @ Transpose the 8x8 pixels, taking advantage of q registers, to get
+ @ one register per column.
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ loop_filter_4
+
+        @ We will only write the mid 4 pixels back; after the loop filter,
+ @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
+ @ (8x4 pixels). We need to transpose them to columns, done with a
+ @ 4x4 transpose (which in practice is two 4x4 transposes of the two
+ @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+9:
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_44_16_neon, export=1
+ vpush {q4-q7}
+ sub r12, r0, r1, lsl #2
+ vld1.8 {q8}, [r12,:128], r1 @ p3
+ vld1.8 {q12}, [r0, :128], r1 @ q0
+ vld1.8 {q9}, [r12,:128], r1 @ p2
+ vld1.8 {q13}, [r0, :128], r1 @ q1
+ vld1.8 {q10}, [r12,:128], r1 @ p1
+ vld1.8 {q14}, [r0, :128], r1 @ q2
+ vld1.8 {q11}, [r12,:128], r1 @ p0
+ vld1.8 {q15}, [r0, :128], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_q
+
+ vst1.8 {q10}, [r12,:128], r1
+ vst1.8 {q12}, [r0, :128], r1
+ vst1.8 {q11}, [r12,:128], r1
+ vst1.8 {q13}, [r0, :128], r1
+9:
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_44_16_neon, export=1
+ vpush {q4-q7}
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d16}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d18}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d30}, [r0], r1
+ mov r12, r0
+ add r0, r0, r1, lsl #2
+ vld1.8 {d17}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d19}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d29}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d31}, [r0], r1
+
+ @ Transpose the 16x8 pixels, as two 8x8 parts
+ transpose_8x8 q8, q9, q10, q11, q12, q13, q14, q15
+
+ loop_filter_q
+
+ sub r12, r0, r1, lsl #4
+ add r0, r12, r1, lsl #3
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #2
+ add r0, r0, #2
+
+        @ We will only write the mid 4 pixels back; after the loop filter,
+ @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
+ @ We need to transpose them to columns, done with a 4x4 transpose
+ @ (which in practice is four 4x4 transposes of the 4x4 blocks of
+ @ the 16x4 pixels; into 4x16 pixels).
+ transpose_4x4 q10, q11, q12, q13
+
+ vst1.32 {d20[0]}, [r12], r1
+ vst1.32 {d21[0]}, [r0], r1
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d23[0]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d25[0]}, [r0], r1
+ vst1.32 {d26[0]}, [r12], r1
+ vst1.32 {d27[0]}, [r0], r1
+ vst1.32 {d20[1]}, [r12], r1
+ vst1.32 {d21[1]}, [r0], r1
+ vst1.32 {d22[1]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[1]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ vst1.32 {d26[1]}, [r12], r1
+ vst1.32 {d27[1]}, [r0], r1
+9:
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_8_8_neon, export=1
+ sub r12, r0, r1, lsl #2
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, r1
+
+ loop_filter_8
+
+ vst1.8 {d21}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d26}, [r0, :64], r1
+9:
+ bx lr
+6:
+ sub r12, r0, r1, lsl #1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_8_8_neon, export=1
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ loop_filter_8
+
+ @ Even though only 6 pixels per row have been changed, we write the
+ @ full 8 pixel registers.
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ vst1.8 {d20}, [r12], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r12], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r12], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r12], r1
+ vst1.8 {d27}, [r0], r1
+9:
+ bx lr
+6:
+ @ If we didn't need to do the flat8in part, we use the same writeback
+ @ as in loop_filter_h_4_8.
+ add r12, r12, #2
+ add r0, r0, #2
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ bx lr
+endfunc
+
+function vp9_loop_filter_v_16_neon
+ sub r12, r0, r1, lsl #3
+ @ Read p7-p0 using r12 and q0-q7 using r0
+ vld1.8 {d16}, [r12,:64], r1 @ p7
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d17}, [r12,:64], r1 @ p6
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d18}, [r12,:64], r1 @ p5
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d19}, [r12,:64], r1 @ p4
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d28}, [r0, :64], r1 @ q4
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d29}, [r0, :64], r1 @ q5
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d30}, [r0, :64], r1 @ q6
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d31}, [r0, :64], r1 @ q7
+ sub r12, r12, r1, lsl #3
+ sub r0, r0, r1, lsl #3
+ add r12, r12, r1
+
+ loop_filter_16
+
+ @ If we did the flat8out part, we get the output in
+ @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
+ @ store d2-d9 there, and d10-d17 into r0.
+ vst1.8 {d2}, [r12,:64], r1
+ vst1.8 {d10}, [r0, :64], r1
+ vst1.8 {d3}, [r12,:64], r1
+ vst1.8 {d11}, [r0, :64], r1
+ vst1.8 {d4}, [r12,:64], r1
+ vst1.8 {d12}, [r0, :64], r1
+ vst1.8 {d5}, [r12,:64], r1
+ vst1.8 {d13}, [r0, :64], r1
+ vst1.8 {d6}, [r12,:64], r1
+ vst1.8 {d14}, [r0, :64], r1
+ vst1.8 {d8}, [r12,:64], r1
+ vst1.8 {d15}, [r0, :64], r1
+ vst1.8 {d9}, [r12,:64], r1
+ vst1.8 {d17}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+9:
+ bx lr
+
+8:
+ add r12, r12, r1, lsl #2
+ @ If we didn't do the flat8out part, the output is left in the
+ @ input registers.
+ vst1.8 {d21}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d26}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx lr
+7:
+ sub r12, r0, r1, lsl #1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_16_8_neon, export=1
+ ldr r12, [sp]
+ push {lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_v_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+
+function ff_vp9_loop_filter_v_16_16_neon, export=1
+ ldr r12, [sp]
+ // The filter clobbers r2 and r3, but we need to keep them for the second round
+ push {r2, r3, lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_v_16_neon
+ add r0, #8
+ ldr r2, [sp, #68]
+ ldr r3, [sp, #72]
+ bl vp9_loop_filter_v_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {r2, r3, pc}
+endfunc
+
+function vp9_loop_filter_h_16_neon
+ sub r12, r0, #8
+ vld1.8 {d16}, [r12,:64], r1
+ vld1.8 {d24}, [r0, :64], r1
+ vld1.8 {d17}, [r12,:64], r1
+ vld1.8 {d25}, [r0, :64], r1
+ vld1.8 {d18}, [r12,:64], r1
+ vld1.8 {d26}, [r0, :64], r1
+ vld1.8 {d19}, [r12,:64], r1
+ vld1.8 {d27}, [r0, :64], r1
+ vld1.8 {d20}, [r12,:64], r1
+ vld1.8 {d28}, [r0, :64], r1
+ vld1.8 {d21}, [r12,:64], r1
+ vld1.8 {d29}, [r0, :64], r1
+ vld1.8 {d22}, [r12,:64], r1
+ vld1.8 {d30}, [r0, :64], r1
+ vld1.8 {d23}, [r12,:64], r1
+ vld1.8 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ sub r12, r12, r1, lsl #3
+
+        @ The 16x8 pixels read above are in two 8x8 blocks; the left
+ @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
+ @ of this, to get one column per register. This could be done with two
+ @ transpose_8x8 as below, but this takes advantage of the q registers.
+ transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d28, d29
+ vtrn.8 d30, d31
+
+ loop_filter_16
+
+ @ Transpose back; this is the same transpose as above, but
+ @ we can't take advantage of q registers for the transpose, since
+        @ not all d registers in the transpose are consecutive.
+ transpose_8x8 d16, d2, d3, d4, d5, d6, d8, d9
+ transpose_8x8 d10, d11, d12, d13, d14, d15, d17, d31
+
+ vst1.8 {d16}, [r12,:64], r1
+ vst1.8 {d10}, [r0, :64], r1
+
+ vst1.8 {d2}, [r12,:64], r1
+ vst1.8 {d11}, [r0, :64], r1
+
+ vst1.8 {d3}, [r12,:64], r1
+ vst1.8 {d12}, [r0, :64], r1
+
+ vst1.8 {d4}, [r12,:64], r1
+ vst1.8 {d13}, [r0, :64], r1
+
+ vst1.8 {d5}, [r12,:64], r1
+ vst1.8 {d14}, [r0, :64], r1
+
+ vst1.8 {d6}, [r12,:64], r1
+ vst1.8 {d15}, [r0, :64], r1
+
+ vst1.8 {d8}, [r12,:64], r1
+ vst1.8 {d17}, [r0, :64], r1
+
+ vst1.8 {d9}, [r12,:64], r1
+ vst1.8 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+9:
+ bx lr
+8:
+ @ The same writeback as in loop_filter_h_8_8
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ vst1.8 {d20}, [r12], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r12], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r12], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r12], r1
+ vst1.8 {d27}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ bx lr
+7:
+ @ The same writeback as in loop_filter_h_4_8
+ sub r12, r0, #2
+ add r0, r12, r1, lsl #2
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_16_8_neon, export=1
+ ldr r12, [sp]
+ push {lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_h_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+
+function ff_vp9_loop_filter_h_16_16_neon, export=1
+ ldr r12, [sp]
+ // The filter clobbers r2 and r3, but we need to keep them for the second round
+ push {r2, r3, lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_h_16_neon
+ add r0, r0, r1, lsl #3
+ ldr r2, [sp, #68]
+ ldr r3, [sp, #72]
+ bl vp9_loop_filter_h_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {r2, r3, pc}
+endfunc
diff --git a/libavcodec/arm/vp9mc_16bpp_neon.S b/libavcodec/arm/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..f6ec0375f2
--- /dev/null
+++ b/libavcodec/arm/vp9mc_16bpp_neon.S
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
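
As a point of reference for the copy/avg entry points below, the avg functions
compute a per-pixel rounding average of dst and ref (what vrhadd.u16 does),
while the copy functions are plain row-wise copies. A hedged scalar C model,
with an illustrative function name and byte strides as in the real signature:

    #include <stdint.h>
    #include <stddef.h>

    static void avg_w_16bpp_scalar(uint8_t *dst, ptrdiff_t dst_stride,
                                   const uint8_t *ref, ptrdiff_t ref_stride,
                                   int w, int h)
    {
        for (int y = 0; y < h; y++) {
            uint16_t       *d = (uint16_t *)(dst + y * dst_stride);
            const uint16_t *r = (const uint16_t *)(ref + y * ref_stride);
            for (int x = 0; x < w; x++)
                d[x] = (d[x] + r[x] + 1) >> 1;         /* rounding average */
        }
    }

The NEON versions below do the same thing 4 to 64 pixels per row at a time,
with the loads, averages and stores interleaved to hide latencies.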
+function ff_vp9_copy128_neon, export=1
+ ldr r12, [sp]
+ sub r1, r1, #96
+ sub r3, r3, #96
+1:
+ subs r12, r12, #1
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2]!
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #96
+ sub r3, r3, #96
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128]!
+ vrhadd.u16 q1, q1, q9
+ vld1.16 {q12, q13}, [r2]!
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vld1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q2, q3}, [lr, :128]!
+ vrhadd.u16 q8, q8, q12
+ vld1.16 {q14, q15}, [r2], r3
+ vrhadd.u16 q9, q9, q13
+ vld1.16 {q10, q11}, [r0, :128], r1
+ vrhadd.u16 q10, q10, q14
+ vst1.16 {q8, q9}, [lr, :128]!
+ vrhadd.u16 q11, q11, q15
+ vst1.16 {q10, q11}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #32
+ sub r3, r3, #32
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q9
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vst1.16 {q2, q3}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #1
+ vld1.16 {q2, q3}, [r2], r3
+ vld1.16 {q0, q1}, [r0, :128]
+ vrhadd.u16 q0, q0, q2
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ subs r12, r12, #2
+ vld1.16 {q2}, [r2], r3
+ vld1.16 {q0}, [r0, :128], r1
+ vld1.16 {q3}, [r2], r3
+ vrhadd.u16 q0, q0, q2
+ vld1.16 {q1}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0}, [lr, :128], r1
+ vst1.16 {q1}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #2
+ vld1.16 {d2}, [r2], r3
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d3}, [r2], r3
+ vrhadd.u16 d0, d0, d2
+ vld1.16 {d1}, [r0, :64]
+ sub r0, r0, r1
+ vrhadd.u16 d1, d1, d3
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
+.macro vmull_lane dst, src, idx
+.if \idx < 4
+ vmull.s16 \dst, \src, d0[\idx]
+.else
+ vmull.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmlal_lane dst, src, idx
+.if \idx < 4
+ vmlal.s16 \dst, \src, d0[\idx]
+.else
+ vmlal.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
+@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
+.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src3, \src4, #(2*\offset)
+ vmlal_lane \dst1, d28, \offset
+ vmlal_lane \dst3, d30, \offset
+.if \size >= 8
+ vmlal_lane \dst2, d29, \offset
+ vmlal_lane \dst4, d31, \offset
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4 or 8 pixels in parallel; for larger
+@ widths it will do 8 pixels at a time and loop horizontally.
+@ The actual width (in bytes) is passed in r5, the height in r4 and
+@ the filter coefficients in r12.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+ sub r2, r2, #6
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 8 loops horizontally and needs
+        @ a reduced dst stride
+.if \size >= 8
+ sub r1, r1, r5
+.endif
+ @ size >= 8 loads two qwords and increments r2,
+        @ for size 4, three dwords without postincrement are enough
+.if \size >= 8
+ sub r3, r3, r5
+ sub r3, r3, #16
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 8
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 8
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r7]!
+.else
+ vld1.16 {d16, d17, d18}, [r2]
+ vld1.16 {d20, d21, d22}, [r7]
+.endif
+2:
+
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q12, d20, d0[0]
+.if \size >= 8
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q13, d21, d0[0]
+.endif
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 1, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 2, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 3, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 4, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 5, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 6, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 7, \size
+
+ @ Round, shift and saturate.
+ @ The vqrshrun takes care of clamping negative values to zero, but
+        @ we need to clamp against the max pixel value manually with vmin.
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d24, q12, #7
+.if \size >= 8
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d25, q13, #7
+ vmin.u16 q1, q1, q3
+ vmin.u16 q12, q12, q3
+.else
+ vmin.u16 d2, d2, d6
+ vmin.u16 d24, d24, d6
+.endif
+ @ Average
+.ifc \type,avg
+.if \size >= 8
+ vld1.16 {q14}, [r0,:128]
+ vld1.16 {q15}, [r6,:128]
+ vrhadd.u16 q1, q1, q14
+ vrhadd.u16 q12, q12, q15
+.else
+ vld1.16 {d28}, [r0,:64]
+ vld1.16 {d30}, [r6,:64]
+ vrhadd.u16 d2, d2, d28
+ vrhadd.u16 d24, d24, d30
+.endif
+.endif
+ @ Store and loop horizontally (for size >= 8)
+.if \size >= 8
+ subs r12, r12, #16
+ vst1.16 {q1}, [r0,:128]!
+ vst1.16 {q12}, [r6,:128]!
+ beq 3f
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [r2]!
+ vld1.16 {q11}, [r7]!
+ b 2b
+.else @ \size == 4
+ vst1.16 {d2}, [r0,:64]
+ vst1.16 {d24}, [r6,:64]
+.endif
+3:
+ @ Loop vertically
+ add r0, r0, r1
+ add r6, r6, r1
+ add r2, r2, r3
+ add r7, r7, r3
+ subs r4, r4, #2
+ bne 1b
+ pop {r4-r7}
+ bx lr
+endfunc
+.endm
+
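To make the data flow of the horizontal filter above easier to follow, here is
a hedged scalar C model of the "put" case (the "avg" case additionally does a
rounding average with the existing dst pixels). The function name is
illustrative, and for brevity the strides are taken in 16-bit elements rather
than in bytes as in the real functions:

    #include <stdint.h>
    #include <stddef.h>

    static void put_8tap_h_scalar(uint16_t *dst, ptrdiff_t dst_stride,
                                  const uint16_t *src, ptrdiff_t src_stride,
                                  int w, int h, const int16_t filter[8], int bpp)
    {
        const int pix_max = (1 << bpp) - 1;            /* the vmvn.u16 constant */
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)            /* taps at x - 3 .. x + 4 */
                    sum += filter[k] * src[x - 3 + k];
                sum = (sum + 64) >> 7;                 /* vqrshrun.s32 #7 */
                dst[x] = sum < 0 ? 0 : sum > pix_max ? pix_max : sum;
            }
            dst += dst_stride;
            src += src_stride;
        }
    }

The NEON code evaluates this for two rows at a time (r0/r6 and r2/r7) and, for
size >= 8, loops horizontally in steps of 8 pixels.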
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+ push {r4-r7}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
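+        @ q3 = ~((0xffff << bpp) & 0xffff) = (1 << bpp) - 1, the max pixel value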
+ vmvn.u16 q3, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #2*\size
+.if \size >= 8
+ b \type\()_8tap_8h
+.else
+ b \type\()_8tap_4h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp, 2, \size, \bpp
+do_8tap_h_func avg, sharp, 2, \size, \bpp
+do_8tap_h_func put, smooth, 0, \size, \bpp
+do_8tap_h_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8, \bpp
+do_8tap_h_filters 4, \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift and saturate and store qreg1-4
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\tmp1}, [r6,:64], r1
+ vld1.16 {\tmp2}, [r6,:64], r1
+ vld1.16 {\tmp3}, [r6,:64], r1
+ vld1.16 {\tmp4}, [r6,:64], r1
+.endif
+ vmin.u16 \dreg1, \dreg1, \minreg
+ vmin.u16 \dreg2, \dreg2, \minreg
+ vmin.u16 \dreg3, \dreg3, \minreg
+ vmin.u16 \dreg4, \dreg4, \minreg
+.ifc \type,avg
+ vrhadd.u16 \dreg1, \dreg1, \tmp1
+ vrhadd.u16 \dreg2, \dreg2, \tmp2
+ vrhadd.u16 \dreg3, \dreg3, \tmp3
+ vrhadd.u16 \dreg4, \dreg4, \tmp4
+.endif
+ vst1.16 {\dreg1}, [r0,:64], r1
+ vst1.16 {\dreg2}, [r0,:64], r1
+ vst1.16 {\dreg3}, [r0,:64], r1
+ vst1.16 {\dreg4}, [r0,:64], r1
+.endm
+
+@ Round, shift and saturate and store qreg1-4
+@ qreg1-2 belong to one line and qreg3-4 to the second line.
+@ dreg1-2 == qreg1, dreg3-4 == qreg2.
+.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\qreg3}, [r6,:128], r1
+ vld1.16 {\qreg4}, [r6,:128], r1
+.endif
+ vmin.u16 \qreg1, \qreg1, \minreg
+ vmin.u16 \qreg2, \qreg2, \minreg
+.ifc \type,avg
+ vrhadd.u16 \qreg1, \qreg1, \qreg3
+ vrhadd.u16 \qreg2, \qreg2, \qreg4
+.endif
+ vst1.16 {\qreg1}, [r0,:128], r1
+ vst1.16 {\qreg2}, [r0,:128], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \tmp1, \src2, d0[1]
+ vmull.s16 \tmp2, \src3, d0[1]
+ vmlal.s16 \dst1, \src3, d0[2]
+ vmlal.s16 \dst2, \src4, d0[2]
+ vmlal.s16 \tmp1, \src4, d0[3]
+ vmlal.s16 \tmp2, \src5, d0[3]
+ vmlal.s16 \dst1, \src5, d1[0]
+ vmlal.s16 \dst2, \src6, d1[0]
+ vmlal.s16 \tmp1, \src6, d1[1]
+ vmlal.s16 \tmp2, \src7, d1[1]
+ vmlal.s16 \dst1, \src7, d1[2]
+ vmlal.s16 \dst2, \src8, d1[2]
+ vmlal.s16 \tmp1, \src8, d1[3]
+ vmlal.s16 \tmp2, \src9, d1[3]
+ vadd.s32 \dst1, \dst1, \tmp1
+ vadd.s32 \dst2, \dst2, \tmp2
+.endm
+
+@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
+@ but with double width (two input/output registers per row).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \dst3, \src3, d0[0]
+ vmull.s16 \dst4, \src4, d0[0]
+ vmlal.s16 \dst1, \src3, d0[1]
+ vmlal.s16 \dst2, \src4, d0[1]
+ vmlal.s16 \dst3, \src5, d0[1]
+ vmlal.s16 \dst4, \src6, d0[1]
+ vmlal.s16 \dst1, \src5, d0[2]
+ vmlal.s16 \dst2, \src6, d0[2]
+ vmlal.s16 \dst3, \src7, d0[2]
+ vmlal.s16 \dst4, \src8, d0[2]
+ vmlal.s16 \dst1, \src7, d0[3]
+ vmlal.s16 \dst2, \src8, d0[3]
+ vmlal.s16 \dst3, \src9, d0[3]
+ vmlal.s16 \dst4, \src10, d0[3]
+ vmlal.s16 \dst1, \src9, d1[0]
+ vmlal.s16 \dst2, \src10, d1[0]
+ vmlal.s16 \dst3, \src11, d1[0]
+ vmlal.s16 \dst4, \src12, d1[0]
+ vmlal.s16 \dst1, \src11, d1[1]
+ vmlal.s16 \dst2, \src12, d1[1]
+ vmlal.s16 \dst3, \src13, d1[1]
+ vmlal.s16 \dst4, \src14, d1[1]
+ vmlal.s16 \dst1, \src13, d1[2]
+ vmlal.s16 \dst2, \src14, d1[2]
+ vmlal.s16 \dst3, \src15, d1[2]
+ vmlal.s16 \dst4, \src16, d1[2]
+ vmlal.s16 \dst1, \src15, d1[3]
+ vmlal.s16 \dst2, \src16, d1[3]
+ vmlal.s16 \dst3, \src17, d1[3]
+ vmlal.s16 \dst4, \src18, d1[3]
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+1:
+.ifc \type,avg
+ mov r6, r0
+.endif
+ mov r12, r4
+
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+2:
+ vld1.16 {q12}, [r2], r3
+ vld1.16 {q13}, [r2], r3
+ vld1.16 {q14}, [r2], r3
+ vld1.16 {q15}, [r2], r3
+ convolve8 q2, q3, q4, q5, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q4, q5, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q4}, [r2], r3
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ convolve8 q2, q3, q8, q9, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+ convolve8 q2, q3, q12, q13, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q12, q13, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ bne 2b
+
+8:
+ subs r5, r5, #8
+ beq 9f
+ @ r0 -= h * dst_stride
+ mls r0, r1, r4, r0
+ @ r2 -= h * src_stride
+ mls r2, r3, r4, r2
+ @ r2 -= 8 * src_stride
+ sub r2, r2, r3, lsl #3
+ @ r2 += 1 * src_stride
+ add r2, r2, r3
+ add r2, r2, #16
+ add r0, r0, #16
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+@ Instantiate a vertical filter function for filtering a 4 pixel wide
+@ slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+.ifc \type,avg
+ mov r6, r0
+.endif
+
+ vld1.16 {d16}, [r2], r3
+ vld1.16 {d17}, [r2], r3
+ vld1.16 {d18}, [r2], r3
+ vld1.16 {d19}, [r2], r3
+ vld1.16 {d20}, [r2], r3
+ vld1.16 {d21}, [r2], r3
+ vld1.16 {d22}, [r2], r3
+ vld1.16 {d23}, [r2], r3
+ vld1.16 {d24}, [r2], r3
+ vld1.16 {d25}, [r2], r3
+ vld1.16 {d26}, [r2], r3
+ convolve4 q2, q3, d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
+ convolve4 q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8, q9
+ do_store4 q2, d4, q3, d6, q14, d28, q15, d30, d5, d7, d29, d31, d2, \type
+
+ subs r4, r4, #4
+ beq 9f
+
+ vld1.16 {d27}, [r2], r3
+ vld1.16 {d28}, [r2], r3
+ vld1.16 {d29}, [r2], r3
+ vld1.16 {d30}, [r2], r3
+ convolve4 q2, q3, d20, d21, d22, d23, d24, d25, d26, d27, d28, q8, q9
+ convolve4 q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
+ do_store4 q2, d4, q3, d6, q8, d16, q9, d18, d5, d7, d17, d19, d2, \type
+
+9:
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+ push {r4-r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #20]
+.if \size >= 8
+ vpush {q4-q7}
+.endif
+ vmvn.u16 q1, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #\size
+.if \size >= 8
+ b \type\()_8tap_8v
+.else
+ b \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp, 2, \size, \bpp
+do_8tap_v_func avg, sharp, 2, \size, \bpp
+do_8tap_v_func put, smooth, 0, \size, \bpp
+do_8tap_v_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8, \bpp
+do_8tap_v_filters 4, \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000000..bd8cda7c30
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
+function ff_vp9_copy64_neon, export=1
+ ldr r12, [sp]
+ sub r1, r1, #32
+ sub r3, r3, #32
+1:
+ vld1.8 {q0, q1}, [r2]!
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3
+ subs r12, r12, #1
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #32
+ sub r3, r3, #32
+ mov lr, r0
+1:
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q10, q11}, [r2], r3
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vst1.8 {q0, q1}, [lr, :128]!
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q2, q3}, [lr, :128], r1
+ subs r12, r12, #1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_copy32_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {q0, q1}, [r2], r3
+ subs r12, r12, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg32_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {q2, q3}, [r2], r3
+ vld1.8 {q0, q1}, [r0, :128]
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+ subs r12, r12, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_copy16_neon, export=1
+ push {r4,lr}
+ ldr r12, [sp, #8]
+ add r4, r0, r1
+ add lr, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+1:
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [lr], r3
+ subs r12, r12, #2
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r4, :128], r1
+ bne 1b
+ pop {r4,pc}
+endfunc
+
+function ff_vp9_avg16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q3}, [r2], r3
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q1}, [r0, :128], r1
+ vrhadd.u8 q1, q1, q3
+ subs r12, r12, #2
+ vst1.8 {q0}, [lr, :128], r1
+ vst1.8 {q1}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_copy8_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ subs r12, r12, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg8_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d3}, [r2], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d1}, [r0, :64]
+ sub r0, r0, r1
+ vrhadd.u8 d1, d1, d3
+ subs r12, r12, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_copy4_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d2[]}, [r2], r3
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d3[]}, [r2], r3
+ subs r12, r12, #4
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg4_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d5[]}, [r2], r3
+ vrhadd.u8 d0, d0, d4
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d6[]}, [r2], r3
+ vrhadd.u8 d1, d1, d5
+ vld1.32 {d2[]}, [r0, :32], r1
+ vld1.32 {d7[]}, [r2], r3
+ vrhadd.u8 d2, d2, d6
+ vld1.32 {d3[]}, [r0, :32], r1
+ subs r12, r12, #4
+ vst1.32 {d0[0]}, [lr, :32], r1
+ vrhadd.u8 d3, d3, d7
+ vst1.32 {d1[0]}, [lr, :32], r1
+ vst1.32 {d2[0]}, [lr, :32], r1
+ vst1.32 {d3[0]}, [lr, :32], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
+.macro vmul_lane dst, src, idx
+.if \idx < 4
+ vmul.s16 \dst, \src, d0[\idx]
+.else
+ vmul.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmla_lane dst, src, idx
+.if \idx < 4
+ vmla.s16 \dst, \src, d0[\idx]
+.else
+ vmla.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
+@ dst1-dst2 and dst3-dst4 for size >= 16)
+.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src4, \src5, #(2*\offset)
+.if \size >= 16
+ vmla_lane \dst1, q14, \offset
+ vext.8 q5, \src2, \src3, #(2*\offset)
+ vmla_lane \dst3, q15, \offset
+ vext.8 q6, \src5, \src6, #(2*\offset)
+ vmla_lane \dst2, q5, \offset
+ vmla_lane \dst4, q6, \offset
+.elseif \size == 8
+ vmla_lane \dst1, q14, \offset
+ vmla_lane \dst3, q15, \offset
+.else
+ vmla_lane \dst1d, d28, \offset
+ vmla_lane \dst3d, d30, \offset
+.endif
+.endm
+@ The same as above, but instead of accumulating straight into the
+@ destination, use a temp register and accumulate with saturation.
+.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src4, \src5, #(2*\offset)
+.if \size >= 16
+ vmul_lane q14, q14, \offset
+ vext.8 q5, \src2, \src3, #(2*\offset)
+ vmul_lane q15, q15, \offset
+ vext.8 q6, \src5, \src6, #(2*\offset)
+ vmul_lane q5, q5, \offset
+ vmul_lane q6, q6, \offset
+.elseif \size == 8
+ vmul_lane q14, q14, \offset
+ vmul_lane q15, q15, \offset
+.else
+ vmul_lane d28, d28, \offset
+ vmul_lane d30, d30, \offset
+.endif
+.if \size == 4
+ vqadd.s16 \dst1d, \dst1d, d28
+ vqadd.s16 \dst3d, \dst3d, d30
+.else
+ vqadd.s16 \dst1, \dst1, q14
+ vqadd.s16 \dst3, \dst3, q15
+.if \size >= 16
+ vqadd.s16 \dst2, \dst2, q5
+ vqadd.s16 \dst4, \dst4, q6
+.endif
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4, 8 or 16 pixels in parallel; for larger
+@ widths it will do 16 pixels at a time and loop horizontally.
+@ The actual width is passed in r5, the height in r4 and
+@ the filter coefficients in r12. idx2 is the index of the largest
+@ filter coefficient (3 or 4) and idx1 is the other one of them.
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+ sub r2, r2, #3
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 16 loops horizontally and needs
+        @ a reduced dst stride
+.if \size >= 16
+ sub r1, r1, r5
+.endif
+ @ size >= 16 loads two qwords and increments r2,
+        @ for size 4/8, one qword without postincrement is enough
+.if \size >= 16
+ sub r3, r3, r5
+ sub r3, r3, #8
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 16
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 16
+ vld1.8 {d18, d19, d20}, [r2]!
+ vld1.8 {d24, d25, d26}, [r7]!
+.else
+ vld1.8 {q9}, [r2]
+ vld1.8 {q12}, [r7]
+.endif
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmovl.u8 q11, d24
+ vmovl.u8 q12, d25
+.if \size >= 16
+ vmovl.u8 q10, d20
+ vmovl.u8 q13, d26
+.endif
+2:
+
+ @ Accumulate, adding idx2 last with a separate
+ @ saturating add. The positive filter coefficients
+ @ for all indices except idx2 must add up to less
+ @ than 127 for this not to overflow.
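+        @ (With 8 bit input pixels this partial sum stays below
+        @ 127 * 255 = 32385, safely within the signed 16 bit accumulator;
+        @ the idx2 tap is then applied with saturating vqadd in extmulqadd.)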
+ vmul.s16 q1, q8, d0[0]
+ vmul.s16 q3, q11, d0[0]
+.if \size >= 16
+ vmul.s16 q2, q9, d0[0]
+ vmul.s16 q4, q12, d0[0]
+.endif
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size
+ extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size
+
+ @ Round, shift and saturate
+ vqrshrun.s16 d2, q1, #7
+ vqrshrun.s16 d6, q3, #7
+.if \size >= 16
+ vqrshrun.s16 d3, q2, #7
+ vqrshrun.s16 d7, q4, #7
+.endif
+ @ Average
+.ifc \type,avg
+.if \size >= 16
+ vld1.8 {q14}, [r0,:128]
+ vld1.8 {q15}, [r6,:128]
+ vrhadd.u8 q1, q1, q14
+ vrhadd.u8 q3, q3, q15
+.elseif \size == 8
+ vld1.8 {d28}, [r0,:64]
+ vld1.8 {d30}, [r6,:64]
+ vrhadd.u8 d2, d2, d28
+ vrhadd.u8 d6, d6, d30
+.else
+ @ We only need d28[0], but [] is faster on some cores
+ vld1.32 {d28[]}, [r0,:32]
+ vld1.32 {d30[]}, [r6,:32]
+ vrhadd.u8 d2, d2, d28
+ vrhadd.u8 d6, d6, d30
+.endif
+.endif
+ @ Store and loop horizontally (for size >= 16)
+.if \size >= 16
+ subs r12, r12, #16
+ vst1.8 {q1}, [r0,:128]!
+ vst1.8 {q3}, [r6,:128]!
+ beq 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.8 {q10}, [r2]!
+ vld1.8 {q13}, [r7]!
+ vmovl.u8 q9, d20
+ vmovl.u8 q10, d21
+ vmovl.u8 q12, d26
+ vmovl.u8 q13, d27
+ b 2b
+.elseif \size == 8
+ vst1.8 {d2}, [r0,:64]
+ vst1.8 {d6}, [r6,:64]
+.else @ \size == 4
+ vst1.32 {d2[0]}, [r0,:32]
+ vst1.32 {d6[0]}, [r6,:32]
+.endif
+3:
+ @ Loop vertically
+ add r0, r0, r1
+ add r6, r6, r1
+ add r2, r2, r3
+ add r7, r7, r3
+ subs r4, r4, #2
+ bne 1b
+.if \size >= 16
+ vpop {q4-q6}
+.endif
+ pop {r4-r7}
+ bx lr
+endfunc
+.endm
+
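+@ The _34/_43 suffix encodes which of taps 3 and 4 carries the largest
+@ coefficient (idx2); the entry point wrappers below choose the variant
+@ from the subpel position.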
+.macro do_8tap_h_size size
+do_8tap_h put, \size, 3, 4
+do_8tap_h avg, \size, 3, 4
+do_8tap_h put, \size, 4, 3
+do_8tap_h avg, \size, 4, 3
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
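+@ Public entry points. The argument order is assumed to follow the
+@ vp9_mc_func prototype used by the C code:
+@   (dst, dst_stride, src, src_stride, h, mx, my)
+@ so h and mx are fetched from the stack below, after the pushes.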
+.macro do_8tap_h_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
+ push {r4-r7}
+.if \size >= 16
+ vpush {q4-q6}
+ ldr r4, [sp, #64]
+ ldr r5, [sp, #68]
+.else
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+.endif
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ cmp r5, #8
+ add r12, r12, r5, lsl #4
+ mov r5, #\size
+.if \size >= 16
+ bge \type\()_8tap_16h_34
+ b \type\()_8tap_16h_43
+.else
+ bge \type\()_8tap_\size\()h_34
+ b \type\()_8tap_\size\()h_43
+.endif
+endfunc
+.endm
+
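+@ \offset selects the filter bank within ff_vp9_subpel_filters, in the
+@ order the invocations below imply (0 = smooth, 1 = regular, 2 = sharp,
+@ assumed to match the FilterMode order in the C code); each bank is
+@ 16 phases * 8 taps * 2 bytes = 256 bytes, hence the 256*\offset in
+@ do_8tap_h_func above.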
+.macro do_8tap_h_filters size
+do_8tap_h_func put, regular, 1, \size
+do_8tap_h_func avg, regular, 1, \size
+do_8tap_h_func put, sharp, 2, \size
+do_8tap_h_func avg, sharp, 2, \size
+do_8tap_h_func put, smooth, 0, \size
+do_8tap_h_func avg, smooth, 0, \size
+.endm
+
+do_8tap_h_filters 64
+do_8tap_h_filters 32
+do_8tap_h_filters 16
+do_8tap_h_filters 8
+do_8tap_h_filters 4
+
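+@ Emit the accumulated literal pool here, between the horizontal and
+@ vertical filters, presumably to keep any literal loads above in range.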
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift and saturate, then store qreg1-2 over 4 lines
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
+ vqrshrun.s16 \dreg1, \qreg1, #7
+ vqrshrun.s16 \dreg2, \qreg2, #7
+.ifc \type,avg
+ vld1.32 {\tmp1[]}, [r0,:32], r1
+ vld1.32 {\tmp2[]}, [r0,:32], r1
+ vld1.32 {\tmp1[1]}, [r0,:32], r1
+ vld1.32 {\tmp2[1]}, [r0,:32], r1
+ vrhadd.u8 \dreg1, \dreg1, \tmp1
+ vrhadd.u8 \dreg2, \dreg2, \tmp2
+ sub r0, r0, r1, lsl #2
+.endif
+ vst1.32 {\dreg1[0]}, [r0,:32], r1
+ vst1.32 {\dreg2[0]}, [r0,:32], r1
+ vst1.32 {\dreg1[1]}, [r0,:32], r1
+ vst1.32 {\dreg2[1]}, [r0,:32], r1
+.endm
+
+@ Round, shift and saturate, then store qreg1-4
+.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
+ vqrshrun.s16 \dreg1, \qreg1, #7
+ vqrshrun.s16 \dreg2, \qreg2, #7
+ vqrshrun.s16 \dreg3, \qreg3, #7
+ vqrshrun.s16 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.8 {\tmp1}, [r0,:64], r1
+ vld1.8 {\tmp2}, [r0,:64], r1
+ vld1.8 {\tmp3}, [r0,:64], r1
+ vld1.8 {\tmp4}, [r0,:64], r1
+ vrhadd.u8 \dreg1, \dreg1, \tmp1
+ vrhadd.u8 \dreg2, \dreg2, \tmp2
+ vrhadd.u8 \dreg3, \dreg3, \tmp3
+ vrhadd.u8 \dreg4, \dreg4, \tmp4
+ sub r0, r0, r1, lsl #2
+.endif
+ vst1.8 {\dreg1}, [r0,:64], r1
+ vst1.8 {\dreg2}, [r0,:64], r1
+ vst1.8 {\dreg3}, [r0,:64], r1
+ vst1.8 {\dreg4}, [r0,:64], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
+@ at the end with saturation. Indices 0 and 7 always have negative or zero
+@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
+@ largest coefficient.
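+@ d0[]/d1[] hold the eight filter taps loaded from r12, so e.g.
+@ dst1 = sat16(filter[0]*src1 + filter[1]*src2 + ... + filter[7]*src8).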
+.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
+ vmul.s16 \dst1, \src2, d0[1]
+ vmul.s16 \dst2, \src3, d0[1]
+ vmul.s16 \tmp1, \src1, d0[0]
+ vmul.s16 \tmp2, \src2, d0[0]
+ vmla.s16 \dst1, \src3, d0[2]
+ vmla.s16 \dst2, \src4, d0[2]
+.if \idx1 == 3
+ vmla.s16 \dst1, \src4, d0[3]
+ vmla.s16 \dst2, \src5, d0[3]
+.else
+ vmla.s16 \dst1, \src5, d1[0]
+ vmla.s16 \dst2, \src6, d1[0]
+.endif
+ vmla.s16 \dst1, \src6, d1[1]
+ vmla.s16 \dst2, \src7, d1[1]
+ vmla.s16 \tmp1, \src8, d1[3]
+ vmla.s16 \tmp2, \src9, d1[3]
+ vmla.s16 \dst1, \src7, d1[2]
+ vmla.s16 \dst2, \src8, d1[2]
+.if \idx2 == 3
+ vmla.s16 \tmp1, \src4, d0[3]
+ vmla.s16 \tmp2, \src5, d0[3]
+.else
+ vmla.s16 \tmp1, \src5, d1[0]
+ vmla.s16 \tmp2, \src6, d1[0]
+.endif
+ vqadd.s16 \dst1, \dst1, \tmp1
+ vqadd.s16 \dst2, \dst2, \tmp2
+.endm
+
+@ Load pixels and extend them to 16 bit
+.macro loadl dst1, dst2, dst3, dst4
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+.ifnb \dst4
+ vld1.8 {d5}, [r2], r3
+.endif
+ vmovl.u8 \dst1, d2
+ vmovl.u8 \dst2, d3
+ vmovl.u8 \dst3, d4
+.ifnb \dst4
+ vmovl.u8 \dst4, d5
+.endif
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
+@ and idx1 is the other one.
+.macro do_8tap_8v type, idx1, idx2
+function \type\()_8tap_8v_\idx1\idx2
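+ @ Back the source pointer up by 3 rows; the 8-tap filter needs
+ @ 3 rows of context above the first output row.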
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+1:
+ mov r12, r4
+
+ loadl q5, q6, q7
+ loadl q8, q9, q10, q11
+2:
+ loadl q12, q13, q14, q15
+ convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5
+ convolve q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6
+ do_store q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ loadl q4, q5, q6, q7
+ convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9
+ convolve q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10
+ do_store q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ loadl q8, q9, q10, q11
+ convolve q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13
+ convolve q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14
+ do_store q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type
+
+ subs r12, r12, #4
+ bne 2b
+
+8:
+ subs r5, r5, #8
+ beq 9f
+ @ r0 -= h * dst_stride
+ mls r0, r1, r4, r0
+ @ r2 -= h * src_stride
+ mls r2, r3, r4, r2
+ @ r2 -= 8 * src_stride
+ sub r2, r2, r3, lsl #3
+ @ r2 += 1 * src_stride
+ add r2, r2, r3
+ add r2, r2, #8
+ add r0, r0, #8
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+do_8tap_8v put, 3, 4
+do_8tap_8v put, 4, 3
+do_8tap_8v avg, 3, 4
+do_8tap_8v avg, 4, 3
+
+@ Instantiate a vertical filter function for filtering a 4-pixel-wide
+@ slice. The first half of each register contains one row, while the
+@ second half contains the row two steps further down (which is also
+@ stored in the first half of the register two steps ahead). The
+@ convolution produces two outputs at a time; q5-q12 feed one and
+@ q6-q13 the other. The first half of the first output is the first
+@ output row, the first half of the other output is the second output
+@ row; the second halves of the outputs are rows 3 and 4.
+@ This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
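+ @ As in the 8-pixel version, back the source pointer up by 3 rows.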
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2], r3
+ vld1.32 {d7[]}, [r2], r3
+ vext.8 d2, d2, d4, #4
+ vld1.32 {d8[]}, [r2], r3
+ vext.8 d3, d3, d5, #4
+ vld1.32 {d9[]}, [r2], r3
+ vmovl.u8 q5, d2
+ vext.8 d4, d4, d6, #4
+ vld1.32 {d28[]}, [r2], r3
+ vmovl.u8 q6, d3
+ vext.8 d5, d5, d7, #4
+ vld1.32 {d29[]}, [r2], r3
+ vmovl.u8 q7, d4
+ vext.8 d6, d6, d8, #4
+ vld1.32 {d30[]}, [r2], r3
+ vmovl.u8 q8, d5
+ vext.8 d7, d7, d9, #4
+ vmovl.u8 q9, d6
+ vext.8 d8, d8, d28, #4
+ vmovl.u8 q10, d7
+ vext.8 d9, d9, d29, #4
+ vmovl.u8 q11, d8
+ vext.8 d28, d28, d30, #4
+ vmovl.u8 q12, d9
+ vmovl.u8 q13, d28
+
+ convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3
+ do_store4 q1, d2, q2, d4, d3, d5, \type
+ subs r4, r4, #4
+ beq 9f
+
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vext.8 d29, d29, d2, #4
+ vext.8 d30, d30, d3, #4
+ vld1.32 {d2[1]}, [r2], r3
+ vmovl.u8 q14, d29
+ vld1.32 {d3[1]}, [r2], r3
+ vmovl.u8 q15, d30
+ vmovl.u8 q5, d2
+ vmovl.u8 q6, d3
+
+ convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3
+ do_store4 q1, d2, q2, d4, d3, d5, \type
+
+9:
+ vpop {q4-q7}
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+do_8tap_4v put, 3, 4
+do_8tap_4v put, 4, 3
+do_8tap_4v avg, 3, 4
+do_8tap_4v avg, 4, 3
+
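+@ Entry point wrappers for the vertical filters; the argument layout is
+@ assumed to be the same as for the horizontal ones, except that the
+@ filter phase is selected by my rather than mx.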
+.macro do_8tap_v_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
+ push {r4-r5}
+ vpush {q4-q7}
+ ldr r4, [sp, #72]
+ movrelx r12, X(ff_vp9_subpel_filters), r5
+ ldr r5, [sp, #80]
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ cmp r5, #8
+ mov r5, #\size
+.if \size >= 8
+ bge \type\()_8tap_8v_34
+ b \type\()_8tap_8v_43
+.else
+ bge \type\()_8tap_4v_34
+ b \type\()_8tap_4v_43
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size
+do_8tap_v_func put, regular, 1, \size
+do_8tap_v_func avg, regular, 1, \size
+do_8tap_v_func put, sharp, 2, \size
+do_8tap_v_func avg, sharp, 2, \size
+do_8tap_v_func put, smooth, 0, \size
+do_8tap_v_func avg, smooth, 0, \size
+.endm
+
+do_8tap_v_filters 64
+do_8tap_v_filters 32
+do_8tap_v_filters 16
+do_8tap_v_filters 8
+do_8tap_v_filters 4