Diffstat (limited to 'libavcodec/arm')
-rw-r--r--  libavcodec/arm/Makefile | 28
-rw-r--r--  libavcodec/arm/aac.h | 8
-rw-r--r--  libavcodec/arm/aacpsdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/aacpsdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_arm.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/ac3dsp_init_arm.c | 18
-rw-r--r--  libavcodec/arm/ac3dsp_neon.S | 52
-rw-r--r--  libavcodec/arm/asm-offsets.h | 8
-rw-r--r--  libavcodec/arm/audiodsp_arm.h | 8
-rw-r--r--  libavcodec/arm/audiodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/audiodsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/audiodsp_neon.S | 8
-rw-r--r--  libavcodec/arm/blockdsp_arm.h | 10
-rw-r--r--  libavcodec/arm/blockdsp_init_arm.c | 12
-rw-r--r--  libavcodec/arm/blockdsp_init_neon.c | 16
-rw-r--r--  libavcodec/arm/blockdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/cabac.h | 14
-rw-r--r--  libavcodec/arm/dca.h | 85
-rw-r--r--  libavcodec/arm/dcadsp_neon.S | 64
-rw-r--r--  libavcodec/arm/dcadsp_vfp.S | 476
-rw-r--r--  libavcodec/arm/fft_fixed_init_arm.c | 20
-rw-r--r--  libavcodec/arm/fft_fixed_neon.S | 8
-rw-r--r--  libavcodec/arm/fft_init_arm.c | 25
-rw-r--r--  libavcodec/arm/fft_neon.S | 8
-rw-r--r--  libavcodec/arm/fft_vfp.S | 8
-rw-r--r--  libavcodec/arm/flacdsp_arm.S | 8
-rw-r--r--  libavcodec/arm/flacdsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/fmtconvert_init_arm.c | 8
-rw-r--r--  libavcodec/arm/fmtconvert_neon.S | 8
-rw-r--r--  libavcodec/arm/fmtconvert_vfp.S | 8
-rw-r--r--  libavcodec/arm/g722dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/g722dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/h264chroma_init_arm.c | 8
-rw-r--r--  libavcodec/arm/h264cmc_neon.S | 10
-rw-r--r--  libavcodec/arm/h264dsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/h264dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/h264idct_neon.S | 8
-rw-r--r--  libavcodec/arm/h264pred_init_arm.c | 10
-rw-r--r--  libavcodec/arm/h264pred_neon.S | 8
-rw-r--r--  libavcodec/arm/h264qpel_init_arm.c | 8
-rw-r--r--  libavcodec/arm/h264qpel_neon.S | 8
-rw-r--r--  libavcodec/arm/hevcdsp_arm.h | 26
-rw-r--r--  libavcodec/arm/hevcdsp_deblock_neon.S | 385
-rw-r--r--  libavcodec/arm/hevcdsp_idct_neon.S | 465
-rw-r--r--  libavcodec/arm/hevcdsp_init_arm.c | 32
-rw-r--r--  libavcodec/arm/hevcdsp_init_neon.c | 224
-rw-r--r--  libavcodec/arm/hevcdsp_qpel_neon.S | 999
-rw-r--r--  libavcodec/arm/hpeldsp_arm.S | 8
-rw-r--r--  libavcodec/arm/hpeldsp_arm.h | 10
-rw-r--r--  libavcodec/arm/hpeldsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_armv6.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/hpeldsp_neon.S | 8
-rw-r--r--  libavcodec/arm/idct.h | 8
-rw-r--r--  libavcodec/arm/idctdsp_arm.S | 10
-rw-r--r--  libavcodec/arm/idctdsp_arm.h | 8
-rw-r--r--  libavcodec/arm/idctdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/idctdsp_init_arm.c | 14
-rw-r--r--  libavcodec/arm/idctdsp_init_armv5te.c | 11
-rw-r--r--  libavcodec/arm/idctdsp_init_armv6.c | 14
-rw-r--r--  libavcodec/arm/idctdsp_init_neon.c | 17
-rw-r--r--  libavcodec/arm/idctdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/int_neon.S | 13
-rw-r--r--  libavcodec/arm/lossless_audiodsp_init_arm.c (renamed from libavcodec/arm/apedsp_init_arm.c) | 12
-rw-r--r--  libavcodec/arm/lossless_audiodsp_neon.S (renamed from libavcodec/arm/apedsp_neon.S) | 10
-rw-r--r--  libavcodec/arm/mathops.h | 8
-rw-r--r--  libavcodec/arm/mdct_fixed_init_arm.c | 40
-rw-r--r--  libavcodec/arm/mdct_fixed_neon.S | 8
-rw-r--r--  libavcodec/arm/mdct_init_arm.c | 47
-rw-r--r--  libavcodec/arm/mdct_neon.S | 8
-rw-r--r--  libavcodec/arm/mdct_vfp.S | 8
-rw-r--r--  libavcodec/arm/me_cmp_armv6.S | 8
-rw-r--r--  libavcodec/arm/me_cmp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mlpdsp_armv5te.S | 8
-rw-r--r--  libavcodec/arm/mlpdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/mlpdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 8
-rw-r--r--  libavcodec/arm/mpegaudiodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegvideo_arm.c | 8
-rw-r--r--  libavcodec/arm/mpegvideo_arm.h | 8
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te.c | 13
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te_s.S | 8
-rw-r--r--  libavcodec/arm/mpegvideo_neon.S | 8
-rw-r--r--  libavcodec/arm/mpegvideoencdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/mpegvideoencdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/neon.S | 8
-rw-r--r--  libavcodec/arm/neontest.c | 28
-rw-r--r--  libavcodec/arm/pixblockdsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/pixblockdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rdft_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rdft_neon.S | 8
-rw-r--r--  libavcodec/arm/rv34dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rv34dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/rv40dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/rv40dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/sbrdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/sbrdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_arm.S | 10
-rw-r--r--  libavcodec/arm/simple_idct_armv5te.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_armv6.S | 8
-rw-r--r--  libavcodec/arm/simple_idct_neon.S | 8
-rw-r--r--  libavcodec/arm/startcode.h | 8
-rw-r--r--  libavcodec/arm/startcode_armv6.S | 8
-rw-r--r--  libavcodec/arm/synth_filter_init_arm.c (renamed from libavcodec/arm/dcadsp_init_arm.c) | 40
-rw-r--r--  libavcodec/arm/synth_filter_neon.S | 8
-rw-r--r--  libavcodec/arm/synth_filter_vfp.S | 8
-rw-r--r--  libavcodec/arm/vc1dsp.h | 8
-rw-r--r--  libavcodec/arm/vc1dsp_init_arm.c | 10
-rw-r--r--  libavcodec/arm/vc1dsp_init_neon.c | 113
-rw-r--r--  libavcodec/arm/vc1dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/videodsp_arm.h | 8
-rw-r--r--  libavcodec/arm/videodsp_armv5te.S | 8
-rw-r--r--  libavcodec/arm/videodsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/videodsp_init_armv5te.c | 10
-rw-r--r--  libavcodec/arm/vorbisdsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vorbisdsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp3dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp3dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp56_arith.h | 8
-rw-r--r--  libavcodec/arm/vp6dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp6dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp8.h | 8
-rw-r--r--  libavcodec/arm/vp8_armv6.S | 8
-rw-r--r--  libavcodec/arm/vp8dsp.h | 8
-rw-r--r--  libavcodec/arm/vp8dsp_armv6.S | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_arm.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_armv6.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_init_neon.c | 8
-rw-r--r--  libavcodec/arm/vp8dsp_neon.S | 8
-rw-r--r--  libavcodec/arm/vp9dsp_init.h | 29
-rw-r--r--  libavcodec/arm/vp9dsp_init_10bpp_arm.c | 23
-rw-r--r--  libavcodec/arm/vp9dsp_init_12bpp_arm.c | 23
-rw-r--r--  libavcodec/arm/vp9dsp_init_16bpp_arm_template.c | 256
-rw-r--r--  libavcodec/arm/vp9dsp_init_arm.c | 257
-rw-r--r--  libavcodec/arm/vp9itxfm_16bpp_neon.S | 1945
-rw-r--r--  libavcodec/arm/vp9itxfm_neon.S | 1688
-rw-r--r--  libavcodec/arm/vp9lpf_16bpp_neon.S | 1044
-rw-r--r--  libavcodec/arm/vp9lpf_neon.S | 959
-rw-r--r--  libavcodec/arm/vp9mc_16bpp_neon.S | 615
-rw-r--r--  libavcodec/arm/vp9mc_neon.S | 720
142 files changed, 10441 insertions, 1267 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index bd4dd4e4ce..1eeac5449e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -21,8 +21,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
arm/idctdsp_arm.o \
arm/jrevdct_arm.o \
arm/simple_idct_arm.o
-OBJS-$(CONFIG_MDCT) += arm/mdct_init_arm.o \
- arm/mdct_fixed_init_arm.o
+OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
@@ -39,12 +38,15 @@ OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
+OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
+OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
+OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
+ arm/vp9dsp_init_12bpp_arm.o \
+ arm/vp9dsp_init_arm.o
# ARMv5 optimizations
@@ -89,8 +91,7 @@ VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
- arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
# NEON optimizations
@@ -130,11 +131,20 @@ NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
- arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
+ arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_idct_neon.o \
+ arm/hevcdsp_qpel_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \
+ arm/vp9itxfm_neon.o \
+ arm/vp9lpf_16bpp_neon.o \
+ arm/vp9lpf_neon.o \
+ arm/vp9mc_16bpp_neon.o \
+ arm/vp9mc_neon.o
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 4f143cb8a9..cafa881fc7 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/aacpsdsp_init_arm.c b/libavcodec/arm/aacpsdsp_init_arm.c
index 6326376004..e04787caae 100644
--- a/libavcodec/arm/aacpsdsp_init_arm.c
+++ b/libavcodec/arm/aacpsdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index fb00900a4d..a93bbfea9c 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index ed8eb37845..1aea190de9 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2028d0b89f..1d2563d4f7 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/ac3dsp_init_arm.c b/libavcodec/arm/ac3dsp_init_arm.c
index a48353a099..a3c32ff407 100644
--- a/libavcodec/arm/ac3dsp_init_arm.c
+++ b/libavcodec/arm/ac3dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -33,6 +33,14 @@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
const int16_t *window, unsigned n);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+ const int32_t *coef0,
+ const int32_t *coef1,
+ int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+ const float *coef0,
+ const float *coef1,
+ int len);
void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
int start, int end,
@@ -59,5 +67,7 @@ av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
c->float_to_fixed24 = ff_float_to_fixed24_neon;
c->extract_exponents = ff_ac3_extract_exponents_neon;
c->apply_window_int16 = ff_apply_window_int16_neon;
+ c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+ c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
}
}
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index f97b1907df..89d0ae8048 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -131,3 +131,47 @@ function ff_apply_window_int16_neon, export=1
pop {r4,pc}
endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+ vmov.i64 q0, #0
+ vmov.i64 q1, #0
+ vmov.i64 q2, #0
+ vmov.i64 q3, #0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.s32 d18, d16, d17
+ vsub.s32 d19, d16, d17
+ vmlal.s32 q0, d16, d16
+ vmlal.s32 q1, d17, d17
+ vmlal.s32 q2, d18, d18
+ vmlal.s32 q3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vadd.s64 d0, d0, d1
+ vadd.s64 d1, d2, d3
+ vadd.s64 d2, d4, d5
+ vadd.s64 d3, d6, d7
+ vst1.64 {q0-q1}, [r0]
+ bx lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+ vmov.f32 q0, #0.0
+ vmov.f32 q1, #0.0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.f32 d18, d16, d17
+ vsub.f32 d19, d16, d17
+ vmla.f32 d0, d16, d16
+ vmla.f32 d1, d17, d17
+ vmla.f32 d2, d18, d18
+ vmla.f32 d3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vpadd.f32 d0, d0, d1
+ vpadd.f32 d1, d2, d3
+ vst1.32 {q0}, [r0]
+ bx lr
+endfunc
diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index 0ea2f04e4a..a2174b0a08 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_arm.h b/libavcodec/arm/audiodsp_arm.h
index e97e804de7..213660dae7 100644
--- a/libavcodec/arm/audiodsp_arm.h
+++ b/libavcodec/arm/audiodsp_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_init_arm.c b/libavcodec/arm/audiodsp_init_arm.c
index ea9ec3ca10..74aa52a4ef 100644
--- a/libavcodec/arm/audiodsp_init_arm.c
+++ b/libavcodec/arm/audiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized audio functions
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_init_neon.c b/libavcodec/arm/audiodsp_init_neon.c
index af532724c8..f7bd162482 100644
--- a/libavcodec/arm/audiodsp_init_neon.c
+++ b/libavcodec/arm/audiodsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/audiodsp_neon.S b/libavcodec/arm/audiodsp_neon.S
index dfb998de32..ab32cef7ab 100644
--- a/libavcodec/arm/audiodsp_neon.S
+++ b/libavcodec/arm/audiodsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/blockdsp_arm.h b/libavcodec/arm/blockdsp_arm.h
index 6d9c2c3ed2..59ebeb8466 100644
--- a/libavcodec/arm/blockdsp_arm.h
+++ b/libavcodec/arm/blockdsp_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -21,6 +21,6 @@
#include "libavcodec/blockdsp.h"
-void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);
+void ff_blockdsp_init_neon(BlockDSPContext *c);
#endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */
diff --git a/libavcodec/arm/blockdsp_init_arm.c b/libavcodec/arm/blockdsp_init_arm.c
index a0c03674d7..2080d5253f 100644
--- a/libavcodec/arm/blockdsp_init_arm.c
+++ b/libavcodec/arm/blockdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized block operations
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,10 +24,10 @@
#include "libavcodec/blockdsp.h"
#include "blockdsp_arm.h"
-av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_arm(BlockDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
- ff_blockdsp_init_neon(c, high_bit_depth);
+ ff_blockdsp_init_neon(c);
}
diff --git a/libavcodec/arm/blockdsp_init_neon.c b/libavcodec/arm/blockdsp_init_neon.c
index 5081cf0cdf..87c0d6d6eb 100644
--- a/libavcodec/arm/blockdsp_init_neon.c
+++ b/libavcodec/arm/blockdsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised block operations
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,10 +28,8 @@
void ff_clear_block_neon(int16_t *block);
void ff_clear_blocks_neon(int16_t *blocks);
-av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
+av_cold void ff_blockdsp_init_neon(BlockDSPContext *c)
{
- if (!high_bit_depth) {
- c->clear_block = ff_clear_block_neon;
- c->clear_blocks = ff_clear_blocks_neon;
- }
+ c->clear_block = ff_clear_block_neon;
+ c->clear_blocks = ff_clear_blocks_neon;
}
diff --git a/libavcodec/arm/blockdsp_neon.S b/libavcodec/arm/blockdsp_neon.S
index 98df2c60c4..9fc63cba5b 100644
--- a/libavcodec/arm/blockdsp_neon.S
+++ b/libavcodec/arm/blockdsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised block functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index 6ff5f1a385..fdbf86b45e 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -59,12 +59,18 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
"tst %[r_c] , %[r_c] \n\t"
"bne 2f \n\t"
"ldr %[r_c] , [%[c], %[byte]] \n\t"
+#if UNCHECKED_BITSTREAM_READER
+ "ldrh %[tmp] , [%[r_c]] \n\t"
+ "add %[r_c] , %[r_c] , #2 \n\t"
+ "str %[r_c] , [%[c], %[byte]] \n\t"
+#else
"ldr %[r_b] , [%[c], %[end]] \n\t"
"ldrh %[tmp] , [%[r_c]] \n\t"
"cmp %[r_c] , %[r_b] \n\t"
"itt lt \n\t"
"addlt %[r_c] , %[r_c] , #2 \n\t"
"strlt %[r_c] , [%[c], %[byte]] \n\t"
+#endif
"sub %[r_c] , %[low] , #1 \n\t"
"add %[r_b] , %[tables] , %[norm_off] \n\t"
"eor %[r_c] , %[low] , %[r_c] \n\t"
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 4aed57603e..ae4b730a8a 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,10 +24,9 @@
#include <stdint.h>
#include "config.h"
-#include "libavcodec/dcadsp.h"
#include "libavcodec/mathops.h"
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4)
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
#define decode_blockcodes decode_blockcodes
static inline int decode_blockcodes(int code1, int code2, int levels,
@@ -35,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
{
int32_t v0, v1, v2, v3, v4, v5;
- __asm__ ("smmul %8, %14, %18 \n"
- "smmul %11, %15, %18 \n"
- "smlabb %14, %8, %17, %14 \n"
- "smlabb %15, %11, %17, %15 \n"
- "smmul %9, %8, %18 \n"
- "smmul %12, %11, %18 \n"
- "sub %14, %14, %16, lsr #1 \n"
- "sub %15, %15, %16, lsr #1 \n"
- "smlabb %8, %9, %17, %8 \n"
- "smlabb %11, %12, %17, %11 \n"
- "smmul %10, %9, %18 \n"
- "smmul %13, %12, %18 \n"
- "str %14, %0 \n"
- "str %15, %4 \n"
- "sub %8, %8, %16, lsr #1 \n"
- "sub %11, %11, %16, lsr #1 \n"
- "smlabb %9, %10, %17, %9 \n"
- "smlabb %12, %13, %17, %12 \n"
- "smmul %14, %10, %18 \n"
- "smmul %15, %13, %18 \n"
- "str %8, %1 \n"
- "str %11, %5 \n"
- "sub %9, %9, %16, lsr #1 \n"
- "sub %12, %12, %16, lsr #1 \n"
- "smlabb %10, %14, %17, %10 \n"
- "smlabb %13, %15, %17, %13 \n"
- "str %9, %2 \n"
- "str %12, %6 \n"
- "sub %10, %10, %16, lsr #1 \n"
- "sub %13, %13, %16, lsr #1 \n"
- "str %10, %3 \n"
- "str %13, %7 \n"
- : "=m"(values[0]), "=m"(values[1]),
- "=m"(values[2]), "=m"(values[3]),
- "=m"(values[4]), "=m"(values[5]),
- "=m"(values[6]), "=m"(values[7]),
- "=&r"(v0), "=&r"(v1), "=&r"(v2),
+ __asm__ ("smmul %0, %6, %10 \n"
+ "smmul %3, %7, %10 \n"
+ "smlabb %6, %0, %9, %6 \n"
+ "smlabb %7, %3, %9, %7 \n"
+ "smmul %1, %0, %10 \n"
+ "smmul %4, %3, %10 \n"
+ "sub %6, %6, %8, lsr #1 \n"
+ "sub %7, %7, %8, lsr #1 \n"
+ "smlabb %0, %1, %9, %0 \n"
+ "smlabb %3, %4, %9, %3 \n"
+ "smmul %2, %1, %10 \n"
+ "smmul %5, %4, %10 \n"
+ "str %6, [%11, #0] \n"
+ "str %7, [%11, #16] \n"
+ "sub %0, %0, %8, lsr #1 \n"
+ "sub %3, %3, %8, lsr #1 \n"
+ "smlabb %1, %2, %9, %1 \n"
+ "smlabb %4, %5, %9, %4 \n"
+ "smmul %6, %2, %10 \n"
+ "smmul %7, %5, %10 \n"
+ "str %0, [%11, #4] \n"
+ "str %3, [%11, #20] \n"
+ "sub %1, %1, %8, lsr #1 \n"
+ "sub %4, %4, %8, lsr #1 \n"
+ "smlabb %2, %6, %9, %2 \n"
+ "smlabb %5, %7, %9, %5 \n"
+ "str %1, [%11, #8] \n"
+ "str %4, [%11, #24] \n"
+ "sub %2, %2, %8, lsr #1 \n"
+ "sub %5, %5, %8, lsr #1 \n"
+ "str %2, [%11, #12] \n"
+ "str %5, [%11, #28] \n"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2),
"=&r"(v3), "=&r"(v4), "=&r"(v5),
"+&r"(code1), "+&r"(code2)
- : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+ : "r"(levels - 1), "r"(-levels),
+ "r"(ff_inverse[levels]), "r"(values)
+ : "memory");
return code1 | code2;
}
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 735c4c28e5..0000000000
--- a/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir0_neon, export=1
- push {r4-r6,lr}
- mov r3, #32 @ decifactor
- mov r6, #256/32
- b dca_lfe_fir
-endfunc
-
-function ff_dca_lfe_fir1_neon, export=1
- push {r4-r6,lr}
- mov r3, #64 @ decifactor
- mov r6, #256/64
-dca_lfe_fir:
- add r4, r0, r3, lsl #2 @ out2
- add r5, r2, #256*4-16 @ cf1
- sub r1, r1, #12
- mov lr, #-16
-1:
- vmov.f32 q2, #0.0 @ v0
- vmov.f32 q3, #0.0 @ v1
- mov r12, r6
-2:
- vld1.32 {q8}, [r2,:128]! @ cf0
- vld1.32 {q9}, [r5,:128], lr @ cf1
- vld1.32 {q1}, [r1], lr @ in
- subs r12, r12, #4
- vrev64.32 q10, q8
- vmla.f32 q3, q1, q9
- vmla.f32 d4, d2, d21
- vmla.f32 d5, d3, d20
- bne 2b
-
- add r1, r1, r6, lsl #2
- subs r3, r3, #1
- vadd.f32 d4, d4, d5
- vadd.f32 d6, d6, d7
- vpadd.f32 d5, d4, d6
- vst1.32 {d5[0]}, [r0,:32]!
- vst1.32 {d5[1]}, [r4,:32]!
- bne 1b
-
- pop {r4-r6,pc}
-endfunc
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
deleted file mode 100644
index c9114d499a..0000000000
--- a/libavcodec/arm/dcadsp_vfp.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-POUT .req a1
-PIN .req a2
-PCOEF .req a3
-OLDFPSCR .req a4
-COUNTER .req ip
-
-IN0 .req s4
-IN1 .req s5
-IN2 .req s6
-IN3 .req s7
-IN4 .req s0
-IN5 .req s1
-IN6 .req s2
-IN7 .req s3
-COEF0 .req s8 @ coefficient elements
-COEF1 .req s9
-COEF2 .req s10
-COEF3 .req s11
-COEF4 .req s12
-COEF5 .req s13
-COEF6 .req s14
-COEF7 .req s15
-ACCUM0 .req s16 @ double-buffered multiply-accumulate results
-ACCUM4 .req s20
-POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
-POST1 .req s25
-POST2 .req s26
-POST3 .req s27
-
-
-.macro inner_loop decifactor, dir, tail, head
- .ifc "\dir","up"
- .set X, 0
- .set Y, 4
- .else
- .set X, 4*JMAX*4 - 4
- .set Y, -4
- .endif
- .ifnc "\head",""
- vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
- vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
- .endif
- .ifnc "\head",""
- vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
- vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
- .ifc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
- .ifnc "\tail",""
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
- vstmia POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
- vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
- vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
- .if \decifactor == 32
- vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
- vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
- vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
- vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
- vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
- vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
- .endif
- .endif
-.endm
-
-.macro dca_lfe_fir decifactor
-function ff_dca_lfe_fir\decifactor\()_vfp, export=1
- fmrx OLDFPSCR, FPSCR
- ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, ip
- vldr IN0, [PIN, #-0*4]
- vldr IN1, [PIN, #-1*4]
- vldr IN2, [PIN, #-2*4]
- vldr IN3, [PIN, #-3*4]
- .if \decifactor == 32
- .set JMAX, 8
- vpush {s16-s31}
- vldr IN4, [PIN, #-4*4]
- vldr IN5, [PIN, #-5*4]
- vldr IN6, [PIN, #-6*4]
- vldr IN7, [PIN, #-7*4]
- .else
- .set JMAX, 4
- vpush {s16-s27}
- .endif
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, up,, head
-1: add PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, up, tail, head
- bne 1b
- inner_loop \decifactor, up, tail
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, down,, head
-1: sub PCOEF, PCOEF, #4*JMAX*4
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, down, tail, head
- bne 1b
- inner_loop \decifactor, down, tail
-
- .if \decifactor == 32
- vpop {s16-s31}
- .else
- vpop {s16-s27}
- .endif
- fmxr FPSCR, OLDFPSCR
- bx lr
-endfunc
-.endm
-
- dca_lfe_fir 64
- .ltorg
- dca_lfe_fir 32
-
- .unreq POUT
- .unreq PIN
- .unreq PCOEF
- .unreq OLDFPSCR
- .unreq COUNTER
-
- .unreq IN0
- .unreq IN1
- .unreq IN2
- .unreq IN3
- .unreq IN4
- .unreq IN5
- .unreq IN6
- .unreq IN7
- .unreq COEF0
- .unreq COEF1
- .unreq COEF2
- .unreq COEF3
- .unreq COEF4
- .unreq COEF5
- .unreq COEF6
- .unreq COEF7
- .unreq ACCUM0
- .unreq ACCUM4
- .unreq POST0
- .unreq POST1
- .unreq POST2
- .unreq POST3
-
-
-IN .req a1
-SBACT .req a2
-OLDFPSCR .req a3
-IMDCT .req a4
-WINDOW .req v1
-OUT .req v2
-BUF .req v3
-SCALEINT .req v4 @ only used in softfp case
-COUNT .req v5
-
-SCALE .req s0
-
-/* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- * fp -> 6 arg words saved by caller
- * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * s0 on entry
- * sp -> 3 arg words for callee
- *
- * softfp
- * fp -> 7 arg words saved by caller
- * a4,v1-v5,fp,lr on entry
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * sp -> 4 arg words for callee
- */
-
-/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- * SynthFilterContext *synth, FFTContext *imdct,
- * float (*synth_buf_ptr)[512],
- * int *synth_buf_offset, float (*synth_buf2)[32],
- * const float (*window)[512], float *samples_out,
- * float (*raXin)[32], float scale);
- */
-function ff_dca_qmf_32_subbands_vfp, export=1
-VFP push {a3-a4,v1-v3,v5,fp,lr}
-NOVFP push {a4,v1-v5,fp,lr}
- add fp, sp, #8*4
- vpush {s16-s23}
- @ The buffer pointed at by raXin isn't big enough for us to do a
- @ complete matrix transposition as we want to, so allocate an
- @ alternative buffer from the stack. Align to 4 words for speed.
- sub BUF, sp, #8*32*4
- bic BUF, BUF, #15
- mov sp, BUF
- ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
- fmrx OLDFPSCR, FPSCR
- fmxr FPSCR, lr
- @ COUNT is used to count down 2 things at once:
- @ bits 0-4 are the number of word pairs remaining in the output row
- @ bits 5-31 are the number of words to copy (with possible negation)
- @ from the source matrix before we start zeroing the remainder
- mov COUNT, #(-4 << 5) + 16
- adds COUNT, COUNT, SBACT, lsl #5
- bmi 2f
-1:
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- vldr s9, [IN, #(3*8+0)*4]
- vldr s11, [IN, #(3*8+1)*4]
- vldr s13, [IN, #(3*8+2)*4]
- vldr s15, [IN, #(3*8+3)*4]
- vldr s17, [IN, #(3*8+4)*4]
- vldr s19, [IN, #(3*8+5)*4]
- vldr s21, [IN, #(3*8+6)*4]
- vldr s23, [IN, #(3*8+7)*4]
- vneg.f s9, s9
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vneg.f s17, s17
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vstr d4, [BUF, #(0*32+2)*4]
- vstr d5, [BUF, #(1*32+2)*4]
- vstr d6, [BUF, #(2*32+2)*4]
- vstr d7, [BUF, #(3*32+2)*4]
- vstr d8, [BUF, #(4*32+2)*4]
- vstr d9, [BUF, #(5*32+2)*4]
- vstr d10, [BUF, #(6*32+2)*4]
- vstr d11, [BUF, #(7*32+2)*4]
- add IN, IN, #4*8*4
- add BUF, BUF, #4*4
- subs COUNT, COUNT, #(4 << 5) + 2
- bpl 1b
-2: @ Now deal with trailing < 4 samples
- adds COUNT, COUNT, #3 << 5
- bmi 4f @ sb_act was a multiple of 4
- bics lr, COUNT, #0x1F
- bne 3f
- @ sb_act was n*4+1
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vneg.f s16, s16
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
- b 4f
-3: @ sb_act was n*4+2 or n*4+3, so do the first 2
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #(2 << 5) + 1
- bics lr, COUNT, #0x1F
- bne 4f
- @ sb_act was n*4+3
- vldr s8, [IN, #(2*8+0)*4]
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vldr s9, zero
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1
-4: @ Now fill the remainder with 0
- vldr s8, zero
- vldr s9, zero
- ands COUNT, COUNT, #0x1F
- beq 6f
-5: vstr d4, [BUF, #(0*32+0)*4]
- vstr d4, [BUF, #(1*32+0)*4]
- vstr d4, [BUF, #(2*32+0)*4]
- vstr d4, [BUF, #(3*32+0)*4]
- vstr d4, [BUF, #(4*32+0)*4]
- vstr d4, [BUF, #(5*32+0)*4]
- vstr d4, [BUF, #(6*32+0)*4]
- vstr d4, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- subs COUNT, COUNT, #1
- bne 5b
-6:
- fmxr FPSCR, OLDFPSCR
- ldr WINDOW, [fp, #3*4]
- ldr OUT, [fp, #4*4]
- sub BUF, BUF, #32*4
-NOVFP ldr SCALEINT, [fp, #6*4]
- mov COUNT, #8
-VFP vpush {SCALE}
-VFP sub sp, sp, #3*4
-NOVFP sub sp, sp, #4*4
-7:
-VFP ldr a1, [fp, #-7*4] @ imdct
-NOVFP ldr a1, [fp, #-8*4]
- ldmia fp, {a2-a4}
-VFP stmia sp, {WINDOW, OUT, BUF}
-NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
-VFP vldr SCALE, [sp, #3*4]
- bl X(ff_synth_filter_float_vfp)
- add OUT, OUT, #32*4
- add BUF, BUF, #32*4
- subs COUNT, COUNT, #1
- bne 7b
-
-A sub sp, fp, #(8+8)*4
-T sub fp, fp, #(8+8)*4
-T mov sp, fp
- vpop {s16-s23}
-VFP pop {a3-a4,v1-v3,v5,fp,pc}
-NOVFP pop {a4,v1-v5,fp,pc}
-endfunc
-
- .unreq IN
- .unreq SBACT
- .unreq OLDFPSCR
- .unreq IMDCT
- .unreq WINDOW
- .unreq OUT
- .unreq BUF
- .unreq SCALEINT
- .unreq COUNT
-
- .unreq SCALE
-
- .align 2
-zero: .word 0
diff --git a/libavcodec/arm/fft_fixed_init_arm.c b/libavcodec/arm/fft_fixed_init_arm.c
index 5132b0959f..11226d65ff 100644
--- a/libavcodec/arm/fft_fixed_init_arm.c
+++ b/libavcodec/arm/fft_fixed_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,6 +26,8 @@
#include "libavcodec/fft.h"
void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
+void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
av_cold void ff_fft_fixed_init_arm(FFTContext *s)
{
@@ -33,6 +35,16 @@ av_cold void ff_fft_fixed_init_arm(FFTContext *s)
if (have_neon(cpu_flags)) {
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
s->fft_calc = ff_fft_fixed_calc_neon;
+#endif
+
+#if CONFIG_MDCT
+ if (!s->inverse && s->nbits >= 3) {
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+ s->mdct_calc = ff_mdct_fixed_calc_neon;
+ s->mdct_calcw = ff_mdct_fixed_calcw_neon;
+ }
+#endif
}
}
diff --git a/libavcodec/arm/fft_fixed_neon.S b/libavcodec/arm/fft_fixed_neon.S
index c70a18991a..2651607544 100644
--- a/libavcodec/arm/fft_fixed_neon.S
+++ b/libavcodec/arm/fft_fixed_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 4d047eaf13..331bd65e5c 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,16 +29,33 @@ void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
av_cold void ff_fft_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp_vm(cpu_flags)) {
s->fft_calc = ff_fft_calc_vfp;
+#if CONFIG_MDCT
+ s->imdct_half = ff_imdct_half_vfp;
+#endif
}
if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
+#endif
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
}
}
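
A note on the pattern in the two init hunks above: the CONFIG_FFT / CONFIG_MDCT preprocessor guards decide whether the optimized kernels are compiled into the build at all, while the av_get_cpu_flags() / have_neon() check decides at run time whether to install them. The following is a minimal standalone sketch of that two-level dispatch, not FFmpeg code; the context type, function names and flag macro are invented stand-ins for FFTContext, the NEON kernels and AV_CPU_FLAG_NEON.

    /* Standalone sketch of the guarded-init pattern used above. */
    #include <stdio.h>

    #define CONFIG_MDCT 1            /* stand-in for the build-system define */
    #define FLAG_NEON   (1 << 0)     /* stand-in for AV_CPU_FLAG_NEON        */

    typedef struct Ctx {
        void (*calc)(struct Ctx *s, float *buf, int n);
    } Ctx;

    static void calc_c(Ctx *s, float *buf, int n)    { (void)s; (void)buf; (void)n; puts("C path"); }
    #if CONFIG_MDCT
    static void calc_neon(Ctx *s, float *buf, int n) { (void)s; (void)buf; (void)n; puts("NEON path"); }
    #endif

    static int get_cpu_flags(void) { return FLAG_NEON; }  /* pretend NEON is present */

    static void init_arm(Ctx *s)
    {
        int cpu_flags = get_cpu_flags();
        s->calc = calc_c;                /* portable default */
    #if CONFIG_MDCT                      /* compile time: kernel exists in this build */
        if (cpu_flags & FLAG_NEON)       /* run time: CPU actually has NEON */
            s->calc = calc_neon;
    #endif
    }

    int main(void)
    {
        Ctx c;
        float buf[8] = { 0 };
        init_arm(&c);
        c.calc(&c, buf, 8);              /* prints "NEON path" */
        return 0;
    }

The same shape recurs throughout this directory: a safe C default, a compile-time guard around the optimized symbol, and a run-time CPU-flag test before installing it.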
diff --git a/libavcodec/arm/fft_neon.S b/libavcodec/arm/fft_neon.S
index b161015e39..48f8dfc424 100644
--- a/libavcodec/arm/fft_neon.S
+++ b/libavcodec/arm/fft_neon.S
@@ -7,20 +7,20 @@
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index c2801fa1a9..ac601325f2 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/flacdsp_arm.S b/libavcodec/arm/flacdsp_arm.S
index d4441da1bb..f8861c5967 100644
--- a/libavcodec/arm/flacdsp_arm.S
+++ b/libavcodec/arm/flacdsp_arm.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/flacdsp_init_arm.c b/libavcodec/arm/flacdsp_init_arm.c
index 0530cf7a85..564e3dc79b 100644
--- a/libavcodec/arm/flacdsp_init_arm.c
+++ b/libavcodec/arm/flacdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -24,9 +24,9 @@
void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
- if (bps <= 16)
- c->lpc = ff_flac_lpc_16_arm;
+ if (CONFIG_FLAC_DECODER)
+ c->lpc16 = ff_flac_lpc_16_arm;
}
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 11396e898c..a734decec0 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -1,20 +1,20 @@
/*
* ARM optimized Format Conversion Utils
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 5d48e3d197..738953e8fc 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -3,20 +3,20 @@
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 4e43f425a5..b14af454eb 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/g722dsp_init_arm.c b/libavcodec/arm/g722dsp_init_arm.c
index 5edf619f17..c0e5d8b989 100644
--- a/libavcodec/arm/g722dsp_init_arm.c
+++ b/libavcodec/arm/g722dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/g722dsp_neon.S b/libavcodec/arm/g722dsp_neon.S
index 5fa3c279e9..757e53f167 100644
--- a/libavcodec/arm/g722dsp_neon.S
+++ b/libavcodec/arm/g722dsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions for G722 coding
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264chroma_init_arm.c b/libavcodec/arm/h264chroma_init_arm.c
index 6f365533cf..13f7e0d702 100644
--- a/libavcodec/arm/h264chroma_init_arm.c
+++ b/libavcodec/arm/h264chroma_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index ee7011b00b..fc48a6f8f6 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -455,7 +455,7 @@ endconst
h264_chroma_mc4 avg, rv40
#endif
-#if CONFIG_VC1_DECODER
+#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350890..90144d0da2 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -72,11 +72,14 @@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_NEON
if (bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ if (chroma_format_idc == 1) {
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ }
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
@@ -96,6 +99,7 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
}
+#endif // HAVE_NEON
}
av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
@@ -103,8 +107,10 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_ARMV6
if (have_setend(cpu_flags))
c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
if (have_neon(cpu_flags))
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565b3e..274a547f26 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index f588f3e744..4f68bdb9f5 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264pred_init_arm.c b/libavcodec/arm/h264pred_init_arm.c
index a445d4d667..cc324d7dca 100644
--- a/libavcodec/arm/h264pred_init_arm.c
+++ b/libavcodec/arm/h264pred_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -49,6 +49,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
+#if HAVE_NEON
const int high_depth = bit_depth > 8;
if (high_depth)
@@ -81,6 +82,7 @@ static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
}
av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
diff --git a/libavcodec/arm/h264pred_neon.S b/libavcodec/arm/h264pred_neon.S
index 332f94bd53..4dc47ba8f1 100644
--- a/libavcodec/arm/h264pred_neon.S
+++ b/libavcodec/arm/h264pred_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264qpel_init_arm.c b/libavcodec/arm/h264qpel_init_arm.c
index 01615b5719..71237be359 100644
--- a/libavcodec/arm/h264qpel_init_arm.c
+++ b/libavcodec/arm/h264qpel_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/h264qpel_neon.S b/libavcodec/arm/h264qpel_neon.S
index 6c51250d5b..21336c6c32 100644
--- a/libavcodec/arm/h264qpel_neon.S
+++ b/libavcodec/arm/h264qpel_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hevcdsp_arm.h b/libavcodec/arm/hevcdsp_arm.h
new file mode 100644
index 0000000000..7735df9cd2
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
+#define AVCODEC_ARM_HEVCDSP_ARM_H
+
+#include "libavcodec/hevcdsp.h"
+
+void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
+
+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..166bddb104
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+.macro hevc_loop_filter_chroma_start
+ ldr r12, [r2]
+ ldr r3, [r2, #4]
+ add r2, r3, r12
+ cmp r2, #0
+ it eq
+ bxeq lr
+.endm
+
+.macro hevc_loop_filter_chroma_body
+ vsubl.u8 q3, d4, d2
+ vsubl.u8 q11, d18, d19
+ vshl.i16 q3, #2
+ vadd.i16 q11, q3
+ vdup.16 d0, r12
+ vdup.16 d1, r3
+ vrshr.s16 q11, q11, #3
+ vneg.s16 q12, q0
+ vmovl.u8 q2, d4
+ vmin.s16 q11, q11, q0
+ vmax.s16 q11, q11, q12
+ vaddw.u8 q1, q11, d2
+ vsub.i16 q2, q11
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+.endm
+
+.macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+ lsl r3, #16
+ orr r3, r12
+ cmp r3, #0
+ it eq
+ bxeq lr
+ lsr r3, #16
+.endm
+
+.macro hevc_loop_filter_luma_body
+ vmovl.u8 q8, d16
+ vmovl.u8 q9, d18
+ vmovl.u8 q10, d20
+ vmovl.u8 q11, d22
+ vmovl.u8 q12, d24
+ vmovl.u8 q13, d26
+ vmovl.u8 q14, d28
+ vmovl.u8 q15, d30
+
+ vadd.i16 q7, q9, q11
+ vadd.i16 q6, q14, q12
+ vsub.i16 q7, q10
+ vsub.i16 q6, q13
+ vabd.s16 q7, q7, q10
+ vabd.s16 q6, q6, q13
+
+
+ vdup.16 q0, r2
+ vmov q4, q7
+ vmov q5, q6
+ vdup.16 d4, r12
+ vtrn.16 q7, q4
+ vtrn.16 q6, q5
+
+ vshl.u64 q7, #32
+ vshr.u64 q4, #32
+ vshl.u64 q6, #32
+ vshr.u64 q5, #32
+ vshr.u64 q7, #32
+ vshr.u64 q6, #32
+ vshl.u64 q5, #32
+ vshl.u64 q4, #32
+ vorr q6, q5
+ vorr q7, q4
+ vdup.16 d5, r3
+ vadd.i16 q5, q7, q6
+
+ vmov q4, q5
+ vmov q3, q5
+ vtrn.32 q3, q4
+
+ vadd.i16 q4, q3
+
+ vshl.s16 q5, q5, #1
+ vcgt.s16 q3, q0, q4
+
+ vmovn.i16 d6, q3
+ vshr.s16 q1, q0, #2
+ vmovn.i16 d6, q3
+ vcgt.s16 q5, q1, q5
+ vmov r7, s12
+ cmp r7, #0
+ beq bypasswrite
+
+ vpadd.i32 d0, d14, d12
+ vpadd.i32 d1, d15, d13
+ vmov q4, q2
+ vshl.s16 q2, #2
+ vshr.s16 q1, q1, #1
+ vrhadd.s16 q2, q4
+
+ vabd.s16 q7, q8, q11
+ vaba.s16 q7, q15, q12
+
+ vmovn.i32 d0, q0
+ vmov r5, r6, s0, s1
+ vcgt.s16 q6, q1, q7
+ vand q5, q5, q6
+ vabd.s16 q7, q11, q12
+ vcgt.s16 q6, q2, q7
+ vand q5, q5, q6
+
+ vmov q2, q5
+ vtrn.s16 q5, q2
+ vshr.u64 q2, #32
+ vshl.u64 q5, #32
+ vshl.u64 q2, #32
+ vshr.u64 q5, #32
+ vorr q5, q2
+
+ vmov q2, q5
+ vshl.i16 q7, q4, #1
+ vtrn.32 q2, q5
+ vand q5, q2
+ vneg.s16 q6, q7
+ vmovn.i16 d4, q5
+ vmovn.i16 d4, q2
+ vmov r8, s8
+
+ and r9, r8, r7
+ cmp r9, #0
+ beq weakfilter_\@
+
+ vadd.i16 q2, q11, q12
+ vadd.i16 q4, q9, q8
+ vadd.i16 q1, q2, q10
+ vdup.16 d10, r9
+ vadd.i16 q0, q1, q9
+ vshl.i16 q4, #1
+ lsr r9, #16
+ vadd.i16 q1, q0
+ vrshr.s16 q3, q0, #2
+ vadd.i16 q1, q13
+ vadd.i16 q4, q0
+ vsub.i16 q3, q10
+ vrshr.s16 q1, #3
+ vrshr.s16 q4, #3
+ vmax.s16 q3, q6
+ vsub.i16 q1, q11
+ vsub.i16 q4, q9
+ vmin.s16 q3, q7
+ vmax.s16 q4, q6
+ vmax.s16 q1, q6
+ vadd.i16 q3, q10
+ vmin.s16 q4, q7
+ vmin.s16 q1, q7
+ vdup.16 d11, r9
+ vadd.i16 q4, q9
+ vadd.i16 q1, q11
+ vbit q9, q4, q5
+ vadd.i16 q4, q2, q13
+ vbit q11, q1, q5
+ vadd.i16 q0, q4, q14
+ vadd.i16 q2, q15, q14
+ vadd.i16 q4, q0
+
+ vshl.i16 q2, #1
+ vadd.i16 q4, q10
+ vbit q10, q3, q5
+ vrshr.s16 q4, #3
+ vadd.i16 q2, q0
+ vrshr.s16 q3, q0, #2
+ vsub.i16 q4, q12
+ vrshr.s16 q2, #3
+ vsub.i16 q3, q13
+ vmax.s16 q4, q6
+ vsub.i16 q2, q14
+ vmax.s16 q3, q6
+ vmin.s16 q4, q7
+ vmax.s16 q2, q6
+ vmin.s16 q3, q7
+ vadd.i16 q4, q12
+ vmin.s16 q2, q7
+ vadd.i16 q3, q13
+ vbit q12, q4, q5
+ vadd.i16 q2, q14
+ vbit q13, q3, q5
+ vbit q14, q2, q5
+
+weakfilter_\@:
+ mvn r8, r8
+ and r9, r8, r7
+ cmp r9, #0
+ beq ready_\@
+
+ vdup.16 q4, r2
+
+ vdup.16 d10, r9
+ lsr r9, #16
+ vmov q1, q4
+ vdup.16 d11, r9
+ vshr.s16 q1, #1
+ vsub.i16 q2, q12, q11
+ vadd.i16 q4, q1
+ vshl.s16 q0, q2, #3
+ vshr.s16 q4, #3
+ vadd.i16 q2, q0
+ vsub.i16 q0, q13, q10
+ vsub.i16 q2, q0
+ vshl.i16 q0, q0, #1
+ vsub.i16 q2, q0
+ vshl.s16 q1, q7, #2
+ vrshr.s16 q2, q2, #4
+ vadd.i16 q1, q7
+ vabs.s16 q3, q2
+ vshr.s16 q6, q6, #1
+ vcgt.s16 q1, q1, q3
+ vand q5, q1
+ vshr.s16 q7, q7, #1
+ vmax.s16 q2, q2, q6
+ vmin.s16 q2, q2, q7
+
+ vshr.s16 q7, q7, #1
+ vrhadd.s16 q3, q9, q11
+ vneg.s16 q6, q7
+ vsub.s16 q3, q10
+ vdup.16 d2, r5
+ vhadd.s16 q3, q2
+ vdup.16 d3, r6
+ vmax.s16 q3, q3, q6
+ vcgt.s16 q1, q4, q1
+ vmin.s16 q3, q3, q7
+ vand q1, q5
+ vadd.i16 q3, q10
+ lsr r5, #16
+ lsr r6, #16
+ vbit q10, q3, q1
+
+ vrhadd.s16 q3, q14, q12
+ vdup.16 d2, r5
+ vsub.s16 q3, q13
+ vdup.16 d3, r6
+ vhsub.s16 q3, q2
+ vcgt.s16 q1, q4, q1
+ vmax.s16 q3, q3, q6
+ vand q1, q5
+ vmin.s16 q3, q3, q7
+ vadd.i16 q3, q13
+ vbit q13, q3, q1
+ vadd.i16 q0, q11, q2
+ vsub.i16 q4, q12, q2
+ vbit q11, q0, q5
+ vbit q12, q4, q5
+
+ready_\@:
+ vqmovun.s16 d16, q8
+ vqmovun.s16 d18, q9
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d26, q13
+ vqmovun.s16 d28, q14
+ vqmovun.s16 d30, q15
+.endm
+
+function ff_hevc_v_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+ push {r5-r11}
+ vpush {d8-d15}
+ sub r0, #4
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d30}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+ hevc_loop_filter_luma_body
+ transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d28}, [r0], r1
+ vst1.8 {d30}, [r0]
+ vpop {d8-d15}
+ pop {r5-r11}
+ bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+ push {r5-r11}
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d30}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r1
+ hevc_loop_filter_luma_body
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d22}, [r0], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d28}, [r0]
+bypasswrite:
+ vpop {d8-d15}
+ pop {r5-r11}
+ bx lr
+endfunc
+
+function ff_hevc_v_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, #4
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d17}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d19}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d21}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+ hevc_loop_filter_chroma_body
+ transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r0], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d19}, [r0], r1
+ vst1.8 {d20}, [r0], r1
+ vst1.8 {d21}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_h_loop_filter_chroma_neon, export=1
+ hevc_loop_filter_chroma_start
+ sub r0, r0, r1, lsl #1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d19}, [r0]
+ sub r0, r0, r1, lsl #1
+ hevc_loop_filter_chroma_body
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0]
+ bx lr
+endfunc
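
For reference, hevc_loop_filter_chroma_body above maps onto a short scalar filter. The sketch below is a standalone model of one pixel column, not the FFmpeg implementation; the p/q naming is inferred from the load/transpose order in ff_hevc_v_loop_filter_chroma_neon (d2 = p0, d4 = q0, d18 = p1, d19 = q1) and tc comes from the two halfwords loaded in hevc_loop_filter_chroma_start.

    /* Scalar model of the NEON chroma deblocking body (one pixel column). */
    #include <stdint.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    static void chroma_deblock_column(uint8_t *p0, uint8_t *q0,
                                      uint8_t p1, uint8_t q1, int tc)
    {
        /* ((q0 - p0) << 2) + (p1 - q1), rounded shift by 3:
         * the vsubl/vshl/vadd/vrshr sequence in the macro. */
        int delta = ((*q0 - *p0) * 4 + p1 - q1 + 4) >> 3;

        /* vmin/vmax against +tc / -tc */
        if (delta >  tc) delta =  tc;
        if (delta < -tc) delta = -tc;

        /* vaddw/vsub followed by vqmovun: adjust p0 and q0, saturate to u8 */
        *p0 = clip_u8(*p0 + delta);
        *q0 = clip_u8(*q0 - delta);
    }

The vertical variant transposes 8x8 blocks so the same body filters across the vertical edge; the horizontal variant skips the transpose and filters the two rows on each side of the edge directly.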
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..e39d00634b
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_hevc_idct_4x4_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q0, r1
+ vdup.16 q1, r1
+ vst1.16 {q0, q1}, [r0]
+ bx lr
+endfunc
+
+function ff_hevc_idct_8x8_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_16x16_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0!, {q8-q15}
+ vstm r0, {q8-q15}
+ bx lr
+endfunc
+
+function ff_hevc_idct_32x32_dc_neon_8, export=1
+ ldrsh r1, [r0]
+ ldr r2, =0x20
+ add r1, #1
+ asr r1, #1
+ add r1, r2
+ asr r1, #6
+ mov r3, #16
+ vdup.16 q8, r1
+ vdup.16 q9, r1
+ vmov.16 q10, q8
+ vmov.16 q11, q8
+ vmov.16 q12, q8
+ vmov.16 q13, q8
+ vmov.16 q14, q8
+ vmov.16 q15, q8
+1: subs r3, #1
+ vstm r0!, {q8-q15}
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_4x4_neon_8, export=1
+ vldm r1, {q0-q1}
+ vld1.32 d4[0], [r0], r2
+ vld1.32 d4[1], [r0], r2
+ vld1.32 d5[0], [r0], r2
+ vld1.32 d5[1], [r0], r2
+ sub r0, r0, r2, lsl #2
+ vmovl.u8 q8, d4
+ vmovl.u8 q9, d5
+ vqadd.s16 q0, q0, q8
+ vqadd.s16 q1, q1, q9
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.32 d0[0], [r0], r2
+ vst1.32 d0[1], [r0], r2
+ vst1.32 d1[0], [r0], r2
+ vst1.32 d1[1], [r0], r2
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_8x8_neon_8, export=1
+ mov r3, #8
+1: subs r3, #1
+ vld1.16 {q0}, [r1]!
+ vld1.8 d16, [r0]
+ vmovl.u8 q8, d16
+ vqadd.s16 q0, q8
+ vqmovun.s16 d0, q0
+ vst1.32 d0, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_16x16_neon_8, export=1
+ mov r3, #16
+1: subs r3, #1
+ vld1.16 {q0, q1}, [r1]!
+ vld1.8 {q8}, [r0]
+ vmovl.u8 q9, d16
+ vmovl.u8 q10, d17
+ vqadd.s16 q0, q9
+ vqadd.s16 q1, q10
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vst1.8 {q0}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
+function ff_hevc_add_residual_32x32_neon_8, export=1
+ mov r3, #32
+1: subs r3, #1
+ vldm r1!, {q0-q3}
+ vld1.8 {q8, q9}, [r0]
+ vmovl.u8 q10, d16
+ vmovl.u8 q11, d17
+ vmovl.u8 q12, d18
+ vmovl.u8 q13, d19
+ vqadd.s16 q0, q10
+ vqadd.s16 q1, q11
+ vqadd.s16 q2, q12
+ vqadd.s16 q3, q13
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+ vqmovun.s16 d3, q3
+ vst1.8 {q0, q1}, [r0], r2
+ bne 1b
+ bx lr
+endfunc
+
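+
The four add_residual routines above differ only in block size: each widens the destination pixels to 16 bit, adds the residual with signed saturation and narrows back with unsigned saturation. A scalar equivalent, given here only as a reading aid (the function name is invented, not an FFmpeg symbol):

    /* Scalar model of ff_hevc_add_residual_*_neon_8:
     * dst += residual, clamped to [0,255] (vmovl.u8 + vqadd.s16 + vqmovun.s16). */
    #include <stdint.h>
    #include <stddef.h>

    static void add_residual_c(uint8_t *dst, const int16_t *res,
                               ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
            }
            dst += stride;      /* picture rows are stride bytes apart   */
            res += size;        /* residual rows are packed back to back */
        }
    }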
+.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.64 \r0, \r4
+ vtrn.64 \r1, \r5
+ vtrn.64 \r2, \r6
+ vtrn.64 \r3, \r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
+
+// in 4 q regs
+// output 8 d regs
+.macro transpose_16b_4x4 r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+/* uses registers q2 - q9 for temp values */
+/* TODO: reorder */
+.macro tr4_luma_shift r0, r1, r2, r3, shift
+ vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2
+ vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3
+ vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3
+ vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1
+
+ vaddl.s16 q7, \r0, \r3 // src0 + src3
+ vsubw.s16 q7, q7, \r2 // src0 - src2 + src3
+ vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3)
+
+ vmul.s32 q8, q5, d0[1] // 29 * c0
+ vmul.s32 q9, q2, d1[0] // 55 * c1
+ vadd.s32 q8, q9 // 29 * c0 + 55 * c1
+ vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3
+
+ vmul.s32 q2, q2, d0[1] // 29 * c1
+ vmul.s32 q9, q4, d1[0] // 55 * c2
+ vsub.s32 q9, q2 // 55 * c2 - 29 * c1
+ vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3
+
+ vmul.s32 q5, q5, d1[0] // 55 * c0
+ vmul.s32 q4, q4, d0[1] // 29 * c2
+ vadd.s32 q5, q4 // 55 * c0 + 29 * c2
+ vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3
+
+ vqrshrn.s32 \r0, q8, \shift
+ vqrshrn.s32 \r1, q9, \shift
+ vqrshrn.s32 \r2, q7, \shift
+ vqrshrn.s32 \r3, q5, \shift
+.endm
+
+/* uses registers q2 - q6 for temp values */
+.macro tr4 r0, r1, r2, r3
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+.endm
+
+.macro tr4_shift r0, r1, r2, r3, shift
+ vmull.s16 q4, \r1, d0[0] // 83 * src1
+ vmull.s16 q6, \r1, d0[1] // 36 * src1
+ vshll.s16 q2, \r0, #6 // 64 * src0
+ vshll.s16 q3, \r2, #6 // 64 * src2
+ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
+ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
+ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
+ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
+
+ vsub.s32 q3, q5, q4 // e0 - o0
+ vadd.s32 q4, q5, q4 // e0 + o0
+ vadd.s32 q5, q2, q6 // e1 + o1
+ vsub.s32 q6, q2, q6 // e1 - o1
+
+ vqrshrn.s32 \r0, q4, \shift
+ vqrshrn.s32 \r1, q5, \shift
+ vqrshrn.s32 \r2, q6, \shift
+ vqrshrn.s32 \r3, q3, \shift
+.endm
+
+function ff_hevc_transform_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x00240053 // 36 and 83
+ vmov.32 d0[0], r3
+
+ tr4_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_hevc_transform_luma_4x4_neon_8, export=1
+ vpush {d8-d15}
+ vld1.16 {q14, q15}, [r0] // coeffs
+ ldr r3, =0x4a // 74
+ vmov.32 d0[0], r3
+ ldr r3, =0x1d // 29
+ vmov.32 d0[1], r3
+ ldr r3, =0x37 // 55
+ vmov.32 d1[0], r3
+
+ tr4_luma_shift d28, d29, d30, d31, #7
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+
+ tr4_luma_shift d28, d29, d30, d31, #12
+
+ vtrn.16 d28, d29
+ vtrn.16 d30, d31
+ vtrn.32 q14, q15
+ vst1.16 {q14, q15}, [r0]
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+.macro tr8_begin in0, in1, in2, in3
+ vmull.s16 q7, \in0, d1[1] // 89 * src1
+ vmull.s16 q8, \in0, d1[0] // 75 * src1
+ vmull.s16 q9, \in0, d1[3] // 50 * src1
+ vmull.s16 q10, \in0, d1[2] // 18 * src1
+
+ vmlal.s16 q7, \in1, d1[0] // 75 * src3
+ vmlsl.s16 q8, \in1, d1[2] //-18 * src3
+ vmlsl.s16 q9, \in1, d1[1] //-89 * src3
+ vmlsl.s16 q10, \in1, d1[3] //-50 * src3
+
+ vmlal.s16 q7, \in2, d1[3] // 50 * src5
+ vmlsl.s16 q8, \in2, d1[1] //-89 * src5
+ vmlal.s16 q9, \in2, d1[2] // 18 * src5
+ vmlal.s16 q10, \in2, d1[0] // 75 * src5
+
+ vmlal.s16 q7, \in3, d1[2] // 18 * src7
+ vmlsl.s16 q8, \in3, d1[3] //-50 * src7
+ vmlal.s16 q9, \in3, d1[0] // 75 * src7
+ vmlsl.s16 q10, \in3, d1[1] //-89 * src7
+.endm
+
+.macro tr8_end shift
+ vadd.s32 q1, q4, q7 // e_8[0] + o_8[0], dst[0]
+ vsub.s32 q4, q4, q7 // e_8[0] - o_8[0], dst[7]
+
+ vadd.s32 q2, q5, q8 // e_8[1] + o_8[1], dst[1]
+ vsub.s32 q5, q5, q8 // e_8[1] - o_8[1], dst[6]
+
+ vadd.s32 q11, q6, q9 // e_8[2] + o_8[2], dst[2]
+ vsub.s32 q6, q6, q9 // e_8[2] - o_8[2], dst[5]
+
+ vadd.s32 q12, q3, q10 // e_8[3] + o_8[3], dst[3]
+ vsub.s32 q3, q3, q10 // e_8[3] - o_8[3], dst[4]
+ vqrshrn.s32 d2, q1, \shift
+ vqrshrn.s32 d3, q2, \shift
+ vqrshrn.s32 d4, q11, \shift
+ vqrshrn.s32 d5, q12, \shift
+ vqrshrn.s32 d6, q3, \shift
+ vqrshrn.s32 d7, q6, \shift
+ vqrshrn.s32 d9, q4, \shift
+ vqrshrn.s32 d8, q5, \shift
+.endm
+
+function ff_hevc_transform_8x8_neon_8, export=1
+ push {r4-r8}
+ vpush {d8-d15}
+ mov r5, #16
+
+ adr r3, tr4f
+ vld1.16 {d0, d1}, [r3]
+
+ // left half
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #128
+ //skip right half if col_limit in r1 is less than 4
+ cmp r1, #4
+ blt 1f
+ //right half
+ add r0, #8
+ vld1.16 {d24}, [r0], r5
+ vld1.16 {d25}, [r0], r5
+ vld1.16 {d26}, [r0], r5
+ vld1.16 {d27}, [r0], r5
+ vld1.16 {d28}, [r0], r5
+ vld1.16 {d29}, [r0], r5
+ vld1.16 {d30}, [r0], r5
+ vld1.16 {d31}, [r0], r5
+ sub r0, #128
+ tr8_begin d25, d27, d29, d31
+ tr4 d24, d26, d28, d30
+ tr8_end #7
+ vst1.16 {d2}, [r0], r5
+ vst1.16 {d3}, [r0], r5
+ vst1.16 {d4}, [r0], r5
+ vst1.16 {d5}, [r0], r5
+ vst1.16 {d6}, [r0], r5
+ vst1.16 {d7}, [r0], r5
+ vst1.16 {d8}, [r0], r5
+ vst1.16 {d9}, [r0], r5
+ sub r0, #136
+1:
+ // top half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ vstm r0!, {q1-q4}
+
+ // bottom half
+ vldm r0, {q12-q15} // coeffs
+ transpose_16b_4x4 d24, d26, d28, d30
+ transpose_16b_4x4 d25, d27, d29, d31
+ tr8_begin d26, d30, d27, d31
+ tr4 d24, d28, d25, d29
+ tr8_end #12
+ transpose_16b_4x4 d2, d3, d4, d5
+ transpose_16b_4x4 d6, d7, d8, d9
+ vswp d7, d5
+ vswp d7, d8
+ vswp d3, d6
+ vswp d6, d4
+ //vstm r0, {q1-q4}
+ vst1.16 {q1-q2}, [r0]
+ add r0, #32
+ vst1.16 {q3-q4}, [r0]
+ sub r0, #32
+ vpop {d8-d15}
+ pop {r4-r8}
+ bx lr
+endfunc
+
+.align 4
+tr4f:
+.word 0x00240053 // 36 and d1[0] = 83
+.word 0x00000000
+tr8f:
+.word 0x0059004b // 89, d0[0] = 75
+.word 0x00320012 // 50, d0[2] = 18
+tr16:
+.word 0x005a0057 // 90, d2[0] = 87
+.word 0x00500046 // 80, d2[2] = 70
+.word 0x0039002b // 57, d2[0] = 43
+.word 0x00190009 // 25, d2[2] = 9
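
The tr4 / tr4_shift macros in the file above implement the standard 4-point HEVC inverse-transform butterfly; the constants 64, 83 and 36 come from the tr4f table (0x00240053 packs 36 and 83). As a reading aid, here is a scalar sketch of what one pass computes, with shift 7 used for the first pass and 12 for the second, as in ff_hevc_transform_4x4_neon_8. It drops the int16 saturation that vqrshrn performs, so it is a model rather than a bit-exact reimplementation.

    /* Scalar model of the tr4_shift macro: 4-point even/odd butterfly with
     * a rounding right shift. src/dst hold one 4-sample column. */
    #include <stdint.h>

    static int16_t rshift_round(int v, int shift)
    {
        return (int16_t)((v + (1 << (shift - 1))) >> shift); /* vqrshrn, minus the saturation */
    }

    static void tr4_shift_c(int16_t dst[4], const int16_t src[4], int shift)
    {
        int e0 = 64 * (src[0] + src[2]);     /* even part */
        int e1 = 64 * (src[0] - src[2]);
        int o0 = 83 * src[1] + 36 * src[3];  /* odd part  */
        int o1 = 36 * src[1] - 83 * src[3];

        dst[0] = rshift_round(e0 + o0, shift);
        dst[1] = rshift_round(e1 + o1, shift);
        dst[2] = rshift_round(e1 - o1, shift);
        dst[3] = rshift_round(e0 - o0, shift);
    }

The 8x8 path builds on the same idea: tr4 produces the even half from the even-indexed inputs, tr8_begin/tr8_end add the odd half using the 89/75/50/18 constants from tr8f.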
diff --git a/libavcodec/arm/hevcdsp_init_arm.c b/libavcodec/arm/hevcdsp_init_arm.c
new file mode 100644
index 0000000000..adcc454511
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+av_cold void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_hevcdsp_init_neon(c, bit_depth);
+}
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
new file mode 100644
index 0000000000..1a3912c609
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/hevcdsp.h"
+#include "hevcdsp_arm.h"
+
+void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
+void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+ ptrdiff_t stride);
+
+#define PUT_PIXELS(name) \
+ void name(int16_t *dst, uint8_t *src, \
+ ptrdiff_t srcstride, int height, \
+ intptr_t mx, intptr_t my, int width)
+PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+#undef PUT_PIXELS
+
+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width);
+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+#define QPEL_FUNC(name) \
+ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
+ int height, int width)
+
+QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#define QPEL_FUNC_UW_PIX(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int height, intptr_t mx, intptr_t my, int width);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
+QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
+#undef QPEL_FUNC_UW_PIX
+
+#define QPEL_FUNC_UW(name) \
+ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+#undef QPEL_FUNC_UW
+
+void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
+}
+
+void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, intptr_t mx, intptr_t my, int width) {
+
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+{
+ if (bit_depth == 8) {
+ int x;
+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
+ c->idct[0] = ff_hevc_transform_4x4_neon_8;
+ c->idct[1] = ff_hevc_transform_8x8_neon_8;
+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
+ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_8;
+ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8;
+ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8;
+ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8;
+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
+ put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
+ put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
+ put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
+ put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
+ put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
+ put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
+ put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
+ put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
+ put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
+ put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
+ put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
+ put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
+ put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
+ put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
+ put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
+ put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
+ put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
+ put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
+ put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
+ put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
+ put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
+ put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
+ put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
+ put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
+ put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
+ put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
+ put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
+ put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
+ put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
+ put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
+ for (x = 0; x < 10; x++) {
+ c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
+ }
+ c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
+ c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+ c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+ }
+}
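Note: the wrappers above dispatch purely on the fractional motion-vector components — mx and my (each 0..3) index the file-local put_hevc_qpel_neon / put_hevc_qpel_uw_neon tables, and the block width is passed through as a runtime argument, which is why the same wrapper can be reused for every block-size slot of c->put_hevc_qpel[x][..]. A minimal stand-alone sketch of that indexing idea follows; the typedef, table layout and function name are simplified illustrations, not the prototypes used by the patch.

/* Hypothetical sketch of the [my][mx] dispatch used by the wrappers above;
 * qpel_fn and call_qpel are illustrative names, not FFmpeg API. */
#include <stddef.h>
#include <stdint.h>

typedef void (*qpel_fn)(int16_t *dst, ptrdiff_t dststride,
                        const uint8_t *src, ptrdiff_t srcstride,
                        int height, int width);

void call_qpel(qpel_fn table[4][4], int16_t *dst,
               const uint8_t *src, ptrdiff_t srcstride,
               int height, intptr_t mx, intptr_t my, int width)
{
    /* mx/my select the horizontal/vertical quarter-pel filter; [0][0]
     * (integer pel) is handled separately by the put_pixels_wN functions.
     * The destination stride is fixed to MAX_PB_SIZE (64) int16_t elements. */
    table[my][mx](dst, 64, src, srcstride, height, width);
}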
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..86f92cf75a
--- /dev/null
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,999 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8
+ vmov d16, d17
+ vmov d17, d18
+ vmov d18, d19
+ vmov d19, d20
+ vmov d20, d21
+ vmov d21, d22
+ vmov d22, d23
+.endm
+
+.macro regshuffle_q8
+ vmov q0, q1
+ vmov q1, q2
+ vmov q2, q3
+ vmov q3, q4
+ vmov q4, q5
+ vmov q5, q6
+ vmov q6, q7
+.endm
+
+.macro vextin8
+ pld [r2]
+ vld1.8 {q11}, [r2], r3
+ vext.8 d16, d22, d23, #1
+ vext.8 d17, d22, d23, #2
+ vext.8 d18, d22, d23, #3
+ vext.8 d19, d22, d23, #4
+ vext.8 d20, d22, d23, #5
+ vext.8 d21, d22, d23, #6
+ vext.8 d22, d22, d23, #7
+.endm
+
+.macro loadin8
+ pld [r2]
+ vld1.8 {d16}, [r2], r3
+ pld [r2]
+ vld1.8 {d17}, [r2], r3
+ pld [r2]
+ vld1.8 {d18}, [r2], r3
+ pld [r2]
+ vld1.8 {d19}, [r2], r3
+ pld [r2]
+ vld1.8 {d20}, [r2], r3
+ pld [r2]
+ vld1.8 {d21}, [r2], r3
+ pld [r2]
+ vld1.8 {d22}, [r2], r3
+ pld [r2]
+ vld1.8 {d23}, [r2], r3
+.endm
+
+.macro qpel_filter_1_32b
+ vmov.i16 d16, #58
+ vmov.i16 d17, #10
+ vmull.s16 q9, d6, d16 // 58 * d0
+ vmull.s16 q10, d7, d16 // 58 * d1
+ vmov.i16 d16, #17
+ vmull.s16 q11, d4, d17 // 10 * c0
+ vmull.s16 q12, d5, d17 // 10 * c1
+ vmov.i16 d17, #5
+ vmull.s16 q13, d8, d16 // 17 * e0
+ vmull.s16 q14, d9, d16 // 17 * e1
+ vmull.s16 q15, d10, d17 // 5 * f0
+ vmull.s16 q8, d11, d17 // 5 * f1
+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0
+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1
+ vshll.s16 q11, d2, #2 // 4 * b0
+ vshll.s16 q12, d3, #2 // 4 * b1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
+ vsubl.s16 q13, d12, d0 // g0 - a0
+ vsubl.s16 q14, d13, d1 // g1 - a1
+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0
+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+// input q0 - q7
+// output q8
+.macro qpel_filter_2_32b
+ vmov.i32 q8, #11
+ vaddl.s16 q9, d6, d8 // d0 + e0
+ vaddl.s16 q10, d7, d9 // d1 + e1
+ vaddl.s16 q11, d4, d10 // c0 + f0
+ vaddl.s16 q12, d5, d11 // c1 + f1
+ vmul.s32 q11, q8 // 11 * (c0 + f0)
+ vmul.s32 q12, q8 // 11 * (c1 + f1)
+ vmov.i32 q8, #40
+ vaddl.s16 q15, d2, d12 // b0 + g0
+ vmul.s32 q9, q8 // 40 * (d0 + e0)
+ vmul.s32 q10, q8 // 40 * (d1 + e1)
+ vaddl.s16 q8, d3, d13 // b1 + g1
+ vaddl.s16 q13, d0, d14 // a0 + h0
+ vaddl.s16 q14, d1, d15 // a1 + h1
+ vshl.s32 q15, #2 // 4*(b0+g0)
+ vshl.s32 q8, #2 // 4*(b1+g1)
+ vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0
+ vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1
+ vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0)
+ vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1)
+ vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+ vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+.macro qpel_filter_3_32b
+ vmov.i16 d16, #58
+ vmov.i16 d17, #10
+ vmull.s16 q9, d8, d16 // 58 * d0
+ vmull.s16 q10, d9, d16 // 58 * d1
+ vmov.i16 d16, #17
+ vmull.s16 q11, d10, d17 // 10 * c0
+ vmull.s16 q12, d11, d17 // 10 * c1
+ vmov.i16 d17, #5
+ vmull.s16 q13, d6, d16 // 17 * e0
+ vmull.s16 q14, d7, d16 // 17 * e1
+ vmull.s16 q15, d4, d17 // 5 * f0
+ vmull.s16 q8, d5, d17 // 5 * f1
+ vsub.s32 q9, q11 // 58 * d0 - 10 * c0
+ vsub.s32 q10, q12 // 58 * d1 - 10 * c1
+ vshll.s16 q11, d12, #2 // 4 * b0
+ vshll.s16 q12, d13, #2 // 4 * b1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
+ vsubl.s16 q13, d2, d14 // g0 - a0
+ vsubl.s16 q14, d3, d15 // g1 - a1
+ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+ vsub.s32 q13, q15 // g0 - a0 - 5 * f0
+ vsub.s32 q14, q8 // g1 - a1 - 5 * f1
+ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+ vqshrn.s32 d16, q9, #6
+ vqshrn.s32 d17, q10, #6
+.endm
+
+.macro qpel_filter_1 out=q7
+ vmov.u8 d24, #58
+ vmov.u8 d25, #10
+ vshll.u8 q13, d20, #4 // 16*e
+ vshll.u8 q14, d21, #2 // 4*f
+ vmull.u8 \out, d19, d24 // 58*d
+ vaddw.u8 q13, q13, d20 // 17*e
+ vmull.u8 q15, d18, d25 // 10*c
+ vaddw.u8 q14, q14, d21 // 5*f
+ vsubl.u8 q12, d22, d16 // g - a
+ vadd.u16 \out, q13 // 58d + 17e
+ vshll.u8 q13, d17, #2 // 4*b
+ vadd.u16 q15, q14 // 10*c + 5*f
+ vadd.s16 q13, q12 // - a + 4*b + g
+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro qpel_filter_2 out=q7
+ vmov.i16 q12, #10
+ vmov.i16 q14, #11
+ vaddl.u8 q13, d19, d20 // d + e
+ vaddl.u8 q15, d18, d21 // c + f
+ vmul.u16 q13, q12 // 10 * (d+e)
+ vmul.u16 q15, q14 // 11 * ( c + f)
+ vaddl.u8 \out, d17, d22 // b + g
+ vaddl.u8 q12, d16, d23 // a + h
+ vadd.u16 \out, q13 // b + 10 * (d + e) + g
+ vadd.s16 q12, q15
+ vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g)
+ vsub.s16 \out, q12
+.endm
+
+.macro qpel_filter_3 out=q7
+ vmov.u8 d24, #58
+ vmov.u8 d25, #10
+ vshll.u8 q13, d19, #4 // 16*e
+ vshll.u8 q14, d18, #2 // 4*f
+ vmull.u8 \out, d20, d24 // 58*d
+ vaddw.u8 q13, q13, d19 // 17*e
+ vmull.u8 q15, d21, d25 // 10*c
+ vaddw.u8 q14, q14, d18 // 5*f
+ vsubl.u8 q12, d17, d23 // g - a
+ vadd.u16 \out, q13 // 58d + 17e
+ vshll.u8 q13, d22, #2 // 4*b
+ vadd.u16 q15, q14 // 10*c + 5*f
+ vadd.s16 q13, q12 // - a + 4*b + g
+ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
+ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
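+
+// Summary of the luma tap weights implemented by the filter macros above
+// (samples a..h = positions -3..+4 in the filtering direction; the filter
+// number is the mx/my fraction selected in ff_hevcdsp_init_neon()):
+//   qpel_filter_1*: -1   4  -10  58  17   -5   1   0
+//   qpel_filter_2*: -1   4  -11  40  40  -11   4  -1
+//   qpel_filter_3*:  0   1   -5  17  58  -10   4  -1
+// The plain macros take 8-bit pixels and produce 16-bit sums; the _32b
+// variants take 16-bit intermediates (combined h+v passes) and narrow
+// their 32-bit accumulators back to 16 bits with a saturating >> 6.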
+
+.macro hevc_put_qpel_vX_neon_8 filter
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+ vpush {d8-d15}
+ sub r2, r2, r3, lsl #1
+ sub r2, r3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ lsl r1, #1
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vst1.16 {q7}, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vst1.16 d14, [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_vX_neon_8 filter
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, r2, r3, lsl #1
+ sub r2, r3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.8 d0, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.32 d0[0], [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+ b 99f
+.Lbi\@: lsl r9, #1
+ mov r10, r8
+0: loadin8
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filter
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q7
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ regshuffle_d8
+ vld1.8 {d23}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r8, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filter
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d14
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ regshuffle_d8
+ vld1.32 {d23[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
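+
+// In hevc_put_qpel_uw_vX_neon_8 the src2 argument selects the output mode:
+// if src2 is NULL, the filtered 16-bit values are rounded back to 8 bits
+// with vqrshrun #6; otherwise (bi-prediction) the 16-bit src2 samples are
+// added with saturation first and the sum is narrowed with vqrshrun #7.
+// The uw_hX and uw_hXvY macros below follow the same pattern.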
+
+function ff_hevc_put_qpel_v1_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+ hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+ hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+
+ vpush {d8-d15}
+ sub r2, #4
+ lsl r1, #1
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vextin8
+ \filter
+ vst1.16 {q7}, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vst1.16 d14, [r0], r1
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, #4
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vextin8
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.8 d0, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vqrshrun.s16 d0, q7, #6
+ vst1.32 d0[0], [r0], r1
+ bne 4b
+ b 99f
+.Lbi\@:
+ lsl r9, #1
+ cmp r5, #4
+ beq 4f
+ mov r10, r8
+8: subs r4, #1
+ vextin8
+ \filter
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q7
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ add r10, #16
+ mov r8, r10
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ cmp r5, #4
+ bne 8b
+4: subs r4, #1
+ vextin8
+ \filter
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d14
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+ hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+ hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
+ push {r4, r5, r6, r7}
+ ldr r4, [sp, #16] // height
+ ldr r5, [sp, #20] // width
+
+ vpush {d8-d15}
+ sub r2, #4
+ sub r2, r2, r3, lsl #1
+ sub r2, r3 // extra_before 3
+ lsl r1, #1
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vst1.16 {q8}, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #16
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vst1.16 d16, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4, r5, r6, r7}
+ bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ sub r2, #4
+ sub r2, r2, r3, lsl #1
+ sub r2, r3 // extra_before 3
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ cmp r8, #0
+ bne .Lbi\@
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vqrshrun.s16 d0, q8, #6
+ vst1.8 d0, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vqrshrun.s16 d0, q8, #6
+ vst1.32 d0[0], [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+ b 99f
+.Lbi\@: lsl r9, #1
+ mov r10, r8
+0: vextin8
+ \filterh q0
+ vextin8
+ \filterh q1
+ vextin8
+ \filterh q2
+ vextin8
+ \filterh q3
+ vextin8
+ \filterh q4
+ vextin8
+ \filterh q5
+ vextin8
+ \filterh q6
+ vextin8
+ \filterh q7
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ \filterv
+ vld1.16 {q0}, [r8], r9
+ vqadd.s16 q0, q8
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r8, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ \filterv
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d16
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ regshuffle_q8
+ vextin8
+ \filterh q7
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
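+// Integer-pel copy path (mx = my = 0): each 8-bit pixel is widened to the
+// 16-bit intermediate format by a left shift of 6 (vshll.u8 #6) and rows
+// are stored with a fixed stride of MAX_PB_SIZE 16-bit elements
+// (r12 = MAX_PB_SIZE * 2 bytes); the w2 and w6 variants use a vbit mask
+// so that destination bytes beyond the block width are left untouched.
+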
+.macro init_put_pixels
+ pld [r1]
+ pld [r1, r2]
+ mov r12, MAX_PB_SIZE
+ lsl r12, #1
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1
+ init_put_pixels
+ vmov.u8 d5, #255
+ vshr.u64 d5, #32
+0: subs r3, #1
+ vld1.32 {d0[0]}, [r1], r2
+ pld [r1]
+ vld1.32 d6, [r0]
+ vshll.u8 q0, d0, #6
+ vbit d6, d0, d5
+ vst1.32 d6, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.32 {d0[0]}, [r1], r2
+ vld1.32 {d0[1]}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q0, d0, #6
+ vst1.64 {d0}, [r0], r12
+ vst1.64 {d1}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1
+ init_put_pixels
+ vmov.u8 q10, #255
+ vshr.u64 d21, #32
+0: subs r3, #1
+ vld1.16 {d0}, [r1], r2
+ pld [r1]
+ vshll.u8 q0, d0, #6
+ vld1.8 {q12}, [r0]
+ vbit q12, q0, q10
+ vst1.8 {q12}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q0, d0, #6
+ vshll.u8 q1, d2, #6
+ vst1.16 {q0}, [r0], r12
+ vst1.16 {q1}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.64 {d0}, [r1]
+ add r1, #8
+ vld1.32 {d1[0]}, [r1], r2
+ sub r1, #8
+ vld1.64 {d2}, [r1]
+ add r1, #8
+ vld1.32 {d1[1]}, [r1], r2
+ sub r1, #8
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vmov d22, d19
+ vst1.64 {d16, d17, d18}, [r0], r12
+ vst1.64 {d20, d21, d22}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1
+ init_put_pixels
+0: subs r3, #2
+ vld1.8 {q0}, [r1], r2
+ vld1.8 {q1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vst1.8 {q8, q9}, [r0], r12
+ vst1.8 {q10, q11}, [r0], r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {d0, d1, d2}, [r1], r2
+ pld [r1]
+ vshll.u8 q10, d0, #6
+ vshll.u8 q11, d1, #6
+ vshll.u8 q12, d2, #6
+ vstm r0, {q10, q11, q12}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1], r2
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vstm r0, {q8, q9, q10, q11}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1]
+ add r1, #32
+ vld1.8 {q2}, [r1], r2
+ sub r1, #32
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vshll.u8 q12, d4, #6
+ vshll.u8 q13, d5, #6
+ vstm r0, {q8, q9, q10, q11, q12, q13}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1
+ init_put_pixels
+0: subs r3, #1
+ vld1.8 {q0, q1}, [r1]
+ add r1, #32
+ vld1.8 {q2, q3}, [r1], r2
+ sub r1, #32
+ pld [r1]
+ vshll.u8 q8, d0, #6
+ vshll.u8 q9, d1, #6
+ vshll.u8 q10, d2, #6
+ vshll.u8 q11, d3, #6
+ vshll.u8 q12, d4, #6
+ vshll.u8 q13, d5, #6
+ vshll.u8 q14, d6, #6
+ vshll.u8 q15, d7, #6
+ vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15}
+ add r0, r12
+ bne 0b
+ bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
+ push {r4-r9}
+ ldr r5, [sp, #24] // width
+ ldr r4, [sp, #28] // height
+ ldr r8, [sp, #32] // src2
+ ldr r9, [sp, #36] // src2stride
+ vpush {d8-d15}
+ cmp r8, #0
+ bne 2f
+1: subs r4, #1
+ vld1.8 {d0}, [r2], r3
+ vst1.8 d0, [r0], r1
+ bne 1b
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+2: subs r4, #1
+ vld1.8 {d0}, [r2], r3
+ vld1.16 {q1}, [r8], r9
+ vshll.u8 q0, d0, #6
+ vqadd.s16 q0, q1
+ vqrshrun.s16 d0, q0, #7
+ vst1.8 d0, [r0], r1
+ bne 2b
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+ ldr r12, [sp] // height
+1: subs r12, #4
+ vld1.32 {\regs} , [r2], r3
+ vld1.32 {\regs2} , [r2], r3
+ vld1.32 {\regs3} , [r2], r3
+ vld1.32 {\regs4} , [r2], r3
+ vst1.32 {\regs} , [r0], r1
+ vst1.32 {\regs2} , [r0], r1
+ vst1.32 {\regs3} , [r0], r1
+ vst1.32 {\regs4} , [r0], r1
+ bne 1b
+ bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+ push {r4-r5}
+ ldr r12, [sp, #8] // height
+1: subs r12, #2
+ mov r4, r2
+ vld1.32 {\regs} , [r2]!
+ vld1.32 {\regs2} , [r2]
+ add r2, r4, r3
+ mov r4, r2
+ vld1.32 {\regs3} , [r2]!
+ vld1.32 {\regs4} , [r2]
+ add r2, r4, r3
+ mov r5, r0
+ vst1.32 {\regs} , [r0]!
+ vst1.32 {\regs2} , [r0]
+ add r0, r5, r1
+ mov r5, r0
+ vst1.32 {\regs3} , [r0]!
+ vst1.32 {\regs4} , [r0]
+ add r0, r5, r1
+ bne 1b
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1]
+put_qpel_uw_pixels 8, d0, d1, d2, d3
+put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0]
+put_qpel_uw_pixels 16, q0, q1, q2, q3
+put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21
+put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11
+put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
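For reference, the arithmetic that the NEON macros in this file vectorize can be written as a short scalar routine. The sketch below is only an illustration derived from the tap weights documented in the macro comments above; qpel_taps and qpel_h_sample are hypothetical names and not part of the patch.

#include <stdint.h>

/* Luma 8-tap qpel weights as implemented by qpel_filter_1/2/3; the taps
 * apply to src[-3] .. src[+4] around the output position. */
static const int8_t qpel_taps[3][8] = {
    { -1, 4, -10, 58, 17,  -5,  1,  0 },   /* fraction 1 */
    { -1, 4, -11, 40, 40, -11,  4, -1 },   /* fraction 2 */
    {  0, 1,  -5, 17, 58, -10,  4, -1 },   /* fraction 3 */
};

/* One output sample of the 8-bit first filter pass.  The NEON code keeps
 * this 16-bit value as-is for the "put" path and rounds it back to 8 bits
 * (vqrshrun #6, i.e. (sum + 32) >> 6 with clipping) for the "uw" path. */
static int16_t qpel_h_sample(const uint8_t *src, int frac)
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += qpel_taps[frac - 1][i] * src[i - 3];
    return (int16_t)sum;
}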
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
index 0f8092e15e..219f793d99 100644
--- a/libavcodec/arm/hpeldsp_arm.S
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -2,20 +2,20 @@
@ ARMv4-optimized halfpel functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
index a8641529d5..5f3c7741c1 100644
--- a/libavcodec/arm/hpeldsp_arm.h
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * Libav is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
index f1abc328eb..a8bd459c20 100644
--- a/libavcodec/arm/hpeldsp_armv6.S
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
index 63906606a2..1977b1379b 100644
--- a/libavcodec/arm/hpeldsp_init_arm.c
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM-optimized halfpel functions
* Copyright (c) 2001 Lionel Ulmer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
index 67a500d513..967a8e0427 100644
--- a/libavcodec/arm/hpeldsp_init_armv6.c
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
index 76d4eafceb..d9feadd1dd 100644
--- a/libavcodec/arm/hpeldsp_init_neon.c
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
index 90bc3cb8ae..cf4a6cfb8d 100644
--- a/libavcodec/arm/hpeldsp_neon.S
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idct.h b/libavcodec/arm/idct.h
index 168d64b666..39cef3a874 100644
--- a/libavcodec/arm/idct.h
+++ b/libavcodec/arm/idct.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S
index 34f467e86f..057eff9be8 100644
--- a/libavcodec/arm/idctdsp_arm.S
+++ b/libavcodec/arm/idctdsp_arm.S
@@ -2,27 +2,27 @@
@ ARMv4-optimized IDCT functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
function ff_add_pixels_clamped_arm, export=1, align=5
push {r4-r10}
mov r10, #8
diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h
index 9012b82904..d7bc5cd02a 100644
--- a/libavcodec/arm/idctdsp_arm.h
+++ b/libavcodec/arm/idctdsp_arm.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S
index c180d732fa..a6e77d6da1 100644
--- a/libavcodec/arm/idctdsp_armv6.S
+++ b/libavcodec/arm/idctdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c
index 8207c31589..0068e3f86c 100644
--- a/libavcodec/arm/idctdsp_init_arm.c
+++ b/libavcodec/arm/idctdsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM-optimized IDCT functions
* Copyright (c) 2001 Lionel Ulmer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -30,7 +30,7 @@
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
- int line_size);
+ ptrdiff_t line_size);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
* converted */
@@ -63,8 +63,8 @@ av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
{
int cpu_flags = av_get_cpu_flags();
- if (!high_bit_depth) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
+ if (!avctx->lowres && !high_bit_depth) {
+ if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_ARM) {
c->idct_put = j_rev_dct_arm_put;
c->idct_add = j_rev_dct_arm_add;
diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c
index 251165dd74..3d881e1f18 100644
--- a/libavcodec/arm/idctdsp_init_armv5te.c
+++ b/libavcodec/arm/idctdsp_init_armv5te.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -29,8 +29,9 @@
av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth &&
+ if (!avctx->lowres && !high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
c->idct_put = ff_simple_idct_put_armv5te;
c->idct_add = ff_simple_idct_add_armv5te;
diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c
index 8f0c49b142..edf3070e15 100644
--- a/libavcodec/arm/idctdsp_init_armv6.c
+++ b/libavcodec/arm/idctdsp_init_armv6.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,13 +27,13 @@
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
- int line_size);
+ ptrdiff_t line_size);
av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
+ if (!avctx->lowres && !high_bit_depth) {
+ if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
c->idct_put = ff_simple_idct_put_armv6;
c->idct_add = ff_simple_idct_add_armv6;
diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c
index c94f7b6e5d..b70c5b0d44 100644
--- a/libavcodec/arm/idctdsp_init_neon.c
+++ b/libavcodec/arm/idctdsp_init_neon.c
@@ -2,20 +2,20 @@
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,15 +27,16 @@
#include "idct.h"
#include "idctdsp_arm.h"
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
- if (!high_bit_depth) {
+ if (!avctx->lowres && !high_bit_depth) {
if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
c->idct_put = ff_simple_idct_put_neon;
c->idct_add = ff_simple_idct_add_neon;
diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S
index 7095879bae..1911a33468 100644
--- a/libavcodec/arm/idctdsp_neon.S
+++ b/libavcodec/arm/idctdsp_neon.S
@@ -2,20 +2,20 @@
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 42f37392e1..72c4c77c45 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -1,21 +1,21 @@
/*
* ARM NEON optimised integer operations
- * Copyright (c) 2009 Kostya Shishkov
+ * Copyright (c) 2009 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -35,7 +35,7 @@ function ff_scalarproduct_int16_neon, export=1
vmlal.s16 q2, d18, d22
vmlal.s16 q3, d19, d23
subs r2, r2, #16
- bne 1b
+ bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
@@ -48,3 +48,4 @@ function ff_scalarproduct_int16_neon, export=1
vmov.32 r0, d3[0]
bx lr
endfunc
+
diff --git a/libavcodec/arm/apedsp_init_arm.c b/libavcodec/arm/lossless_audiodsp_init_arm.c
index 47ea034359..981a39aff9 100644
--- a/libavcodec/arm/apedsp_init_arm.c
+++ b/libavcodec/arm/lossless_audiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -23,12 +23,12 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
-#include "libavcodec/apedsp.h"
+#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
-av_cold void ff_apedsp_init_arm(APEDSPContext *c)
+av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/apedsp_neon.S b/libavcodec/arm/lossless_audiodsp_neon.S
index 7cfbf43c6d..ba7c45fcef 100644
--- a/libavcodec/arm/apedsp_neon.S
+++ b/libavcodec/arm/lossless_audiodsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised integer operations
* Copyright (c) 2009 Kostya Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -47,7 +47,7 @@ function ff_scalarproduct_and_madd_int16_neon, export=1
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
- bne 1b
+ bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 45ac67d436..dc57c5571c 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -2,20 +2,20 @@
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_fixed_init_arm.c b/libavcodec/arm/mdct_fixed_init_arm.c
deleted file mode 100644
index 606c80cbf4..0000000000
--- a/libavcodec/arm/mdct_fixed_init_arm.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#define FFT_FLOAT 0
-#include "libavcodec/fft.h"
-
-void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
-void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
-
-av_cold void ff_mdct_fixed_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- if (!s->inverse && s->nbits >= 3) {
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- s->mdct_calc = ff_mdct_fixed_calc_neon;
- s->mdct_calcw = ff_mdct_fixed_calcw_neon;
- }
- }
-}
diff --git a/libavcodec/arm/mdct_fixed_neon.S b/libavcodec/arm/mdct_fixed_neon.S
index c77be59c65..365c5e7faf 100644
--- a/libavcodec/arm/mdct_fixed_neon.S
+++ b/libavcodec/arm/mdct_fixed_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_init_arm.c b/libavcodec/arm/mdct_init_arm.c
deleted file mode 100644
index 24678dd8d0..0000000000
--- a/libavcodec/arm/mdct_init_arm.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-
-#include "libavcodec/fft.h"
-
-void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-av_cold void ff_mdct_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags)) {
- s->imdct_half = ff_imdct_half_vfp;
- }
-
- if (have_neon(cpu_flags)) {
- s->imdct_calc = ff_imdct_calc_neon;
- s->imdct_half = ff_imdct_half_neon;
- s->mdct_calc = ff_mdct_calc_neon;
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- }
-}
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index bfe259c396..a6952fa571 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index f3fe668eae..43f6d14c0c 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S
index 436e20dd25..fa5a82301e 100644
--- a/libavcodec/arm/me_cmp_armv6.S
+++ b/libavcodec/arm/me_cmp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c
index 4d73f3e0fd..03870a2bfa 100644
--- a/libavcodec/arm/me_cmp_init_arm.c
+++ b/libavcodec/arm/me_cmp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_armv5te.S b/libavcodec/arm/mlpdsp_armv5te.S
index 4272dae029..4f9aa485fd 100644
--- a/libavcodec/arm/mlpdsp_armv5te.S
+++ b/libavcodec/arm/mlpdsp_armv5te.S
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
index de9db466a5..b7ecf6cfae 100644
--- a/libavcodec/arm/mlpdsp_armv6.S
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 4cdd10caf5..34a5f61e1d 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -2,20 +2,20 @@
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 49bd0bcaf2..977abb6939 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegaudiodsp_init_arm.c b/libavcodec/arm/mpegaudiodsp_init_arm.c
index e73aee6a2b..98e0c8a3a8 100644
--- a/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ b/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 34e9cf18b5..918be16d03 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2002 Michael Niedermayer
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_arm.h b/libavcodec/arm/mpegvideo_arm.h
index 17e3a5b024..709ae6b247 100644
--- a/libavcodec/arm/mpegvideo_arm.h
+++ b/libavcodec/arm/mpegvideo_arm.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 4bb7b6e025..e20bb4c645 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -2,24 +2,25 @@
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
@@ -55,7 +56,7 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
int level, qmul, qadd;
int nCoeffs;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
qmul = qscale << 1;
@@ -84,7 +85,7 @@ static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
int qmul, qadd;
int nCoeffs;
- assert(s->block_last_index[n]>=0);
+ av_assert2(s->block_last_index[n]>=0);
qadd = (qscale - 1) | 1;
qmul = qscale << 1;
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 4426e15e91..8687d6b31c 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -2,20 +2,20 @@
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 3e1f7b53e2..1889d7a912 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
index 99db501b25..ab0dad7b18 100644
--- a/libavcodec/arm/mpegvideoencdsp_armv6.S
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c
index ab9ba3e1be..4bfe835684 100644
--- a/libavcodec/arm/mpegvideoencdsp_init_arm.c
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/neon.S b/libavcodec/arm/neon.S
index 716a607af7..787bc4bf36 100644
--- a/libavcodec/arm/neon.S
+++ b/libavcodec/arm/neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/neontest.c b/libavcodec/arm/neontest.c
index 692576ee45..56f950abe0 100644
--- a/libavcodec/arm/neontest.c
+++ b/libavcodec/arm/neontest.c
@@ -2,20 +2,20 @@
* check NEON registers for clobbers
* Copyright (c) 2013 Martin Storsjo
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -77,3 +77,23 @@ wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
{
testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}
+
+wrap(avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt))
+{
+ testneonclobbers(avcodec_send_packet, avctx, avpkt);
+}
+
+wrap(avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame))
+{
+ testneonclobbers(avcodec_receive_frame, avctx, frame);
+}
+
+wrap(avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame))
+{
+ testneonclobbers(avcodec_send_frame, avctx, frame);
+}
+
+wrap(avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt))
+{
+ testneonclobbers(avcodec_receive_packet, avctx, avpkt);
+}
diff --git a/libavcodec/arm/pixblockdsp_armv6.S b/libavcodec/arm/pixblockdsp_armv6.S
index 4c925a4daa..b10ea78e88 100644
--- a/libavcodec/arm/pixblockdsp_armv6.S
+++ b/libavcodec/arm/pixblockdsp_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/pixblockdsp_init_arm.c b/libavcodec/arm/pixblockdsp_init_arm.c
index bb32631df4..59d2b49381 100644
--- a/libavcodec/arm/pixblockdsp_init_arm.c
+++ b/libavcodec/arm/pixblockdsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rdft_init_arm.c b/libavcodec/arm/rdft_init_arm.c
index 2858ba93e8..1c5d8beb61 100644
--- a/libavcodec/arm/rdft_init_arm.c
+++ b/libavcodec/arm/rdft_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 7d01d53f1a..781d976354 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised RDFT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv34dsp_init_arm.c b/libavcodec/arm/rv34dsp_init_arm.c
index 5ce787ba7f..8bfe90b3d3 100644
--- a/libavcodec/arm/rv34dsp_init_arm.c
+++ b/libavcodec/arm/rv34dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a29123f772..3d4a83d9ac 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv40dsp_init_arm.c b/libavcodec/arm/rv40dsp_init_arm.c
index df3e4611a1..c24854d1cd 100644
--- a/libavcodec/arm/rv40dsp_init_arm.c
+++ b/libavcodec/arm/rv40dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 6bd45eb5ad..099f88c092 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -2,20 +2,20 @@
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
index 4da7967b49..4fb69f922b 100644
--- a/libavcodec/arm/sbrdsp_init_arm.c
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
index 610397f9e2..e66abd682a 100644
--- a/libavcodec/arm/sbrdsp_neon.S
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index a651927515..42d79ab95e 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -4,22 +4,22 @@
* Author: Frederic Boulay <dilb@handhelds.org>
*
* The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the Libav project.
+ * from the libavcodec library part of the FFmpeg project.
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index b19683320a..a8d03469ab 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -4,20 +4,20 @@
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index 60723467a0..79cf5d41fb 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -4,20 +4,20 @@
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index a1cde8d80a..c3e573c00a 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -6,20 +6,20 @@
* Based on Simple IDCT
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/startcode.h b/libavcodec/arm/startcode.h
index d7996c1a4b..cf25d9d4df 100644
--- a/libavcodec/arm/startcode.h
+++ b/libavcodec/arm/startcode.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
index 64078b2898..a46f009375 100644
--- a/libavcodec/arm/startcode_armv6.S
+++ b/libavcodec/arm/startcode_armv6.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/synth_filter_init_arm.c
index bf0d9b4b17..ea0ce148d4 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/synth_filter_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -22,20 +22,9 @@
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
-
-void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
-void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
-
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- SynthFilterContext *synth, FFTContext *imdct,
- float synth_buf_ptr[512],
- int *synth_buf_offset, float synth_buf2[32],
- const float window[512], float *samples_out,
- float raXin[32], float scale);
+#include "libavutil/internal.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/synth_filter.h"
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
@@ -49,21 +38,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp_vm(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir32_vfp;
- s->lfe_fir[1] = ff_dca_lfe_fir64_vfp;
- s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
- }
- if (have_neon(cpu_flags)) {
- s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
- s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
- }
-}
-
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 62bb6674ed..5417be7d53 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 5d79e509f9..596734c5bc 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -2,20 +2,20 @@
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vc1dsp.h b/libavcodec/arm/vc1dsp.h
index 30f059f28c..cd01ac5384 100644
--- a/libavcodec/arm/vc1dsp.h
+++ b/libavcodec/arm/vc1dsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index a6a97c8bf9..5f2c759048 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -28,8 +28,10 @@ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_ARMV6
if (have_setend(cpu_flags))
dsp->startcode_find_candidate = ff_startcode_find_candidate_armv6;
+#endif
if (have_neon(cpu_flags))
ff_vc1dsp_init_neon(dsp);
}
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 9ded7a28b9..bb873e687e 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -37,40 +37,38 @@ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, int linesize, int16_t *block);
void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int rnd);
-void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc20_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc30_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc01_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc02_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc03_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc11_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc12_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc13_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc21_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc22_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc23_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-
-void ff_put_vc1_mspel_mc31_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc32_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
-void ff_put_vc1_mspel_mc33_neon(uint8_t *dst, const uint8_t *src,
- ptrdiff_t stride, int rnd);
+#define DECL_PUT(X, Y) \
+void ff_put_vc1_mspel_mc##X##Y##_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd); \
+static void ff_put_vc1_mspel_mc##X##Y##_16_neon(uint8_t *dst, const uint8_t *src, \
+ ptrdiff_t stride, int rnd) \
+{ \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+ dst += 8*stride; src += 8*stride; \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+0, src+0, stride, rnd); \
+ ff_put_vc1_mspel_mc##X##Y##_neon(dst+8, src+8, stride, rnd); \
+}
+
+DECL_PUT(1, 0)
+DECL_PUT(2, 0)
+DECL_PUT(3, 0)
+
+DECL_PUT(0, 1)
+DECL_PUT(0, 2)
+DECL_PUT(0, 3)
+
+DECL_PUT(1, 1)
+DECL_PUT(1, 2)
+DECL_PUT(1, 3)
+
+DECL_PUT(2, 1)
+DECL_PUT(2, 2)
+DECL_PUT(2, 3)
+
+DECL_PUT(3, 1)
+DECL_PUT(3, 2)
+DECL_PUT(3, 3)
void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h,
int x, int y);
@@ -81,6 +79,10 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h,
int x, int y);
+#define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+
av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
{
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
@@ -92,23 +94,26 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
- dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
+ dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
if (HAVE_AS_DN_DIRECTIVE) {
- dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
- dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
- dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
- dsp->put_vc1_mspel_pixels_tab[ 4] = ff_put_vc1_mspel_mc01_neon;
- dsp->put_vc1_mspel_pixels_tab[ 5] = ff_put_vc1_mspel_mc11_neon;
- dsp->put_vc1_mspel_pixels_tab[ 6] = ff_put_vc1_mspel_mc21_neon;
- dsp->put_vc1_mspel_pixels_tab[ 7] = ff_put_vc1_mspel_mc31_neon;
- dsp->put_vc1_mspel_pixels_tab[ 8] = ff_put_vc1_mspel_mc02_neon;
- dsp->put_vc1_mspel_pixels_tab[ 9] = ff_put_vc1_mspel_mc12_neon;
- dsp->put_vc1_mspel_pixels_tab[10] = ff_put_vc1_mspel_mc22_neon;
- dsp->put_vc1_mspel_pixels_tab[11] = ff_put_vc1_mspel_mc32_neon;
- dsp->put_vc1_mspel_pixels_tab[12] = ff_put_vc1_mspel_mc03_neon;
- dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
- dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
- dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
+ FN_ASSIGN(1, 0);
+ FN_ASSIGN(2, 0);
+ FN_ASSIGN(3, 0);
+
+ FN_ASSIGN(0, 1);
+ FN_ASSIGN(1, 1);
+ FN_ASSIGN(2, 1);
+ FN_ASSIGN(3, 1);
+
+ FN_ASSIGN(0, 2);
+ FN_ASSIGN(1, 2);
+ FN_ASSIGN(2, 2);
+ FN_ASSIGN(3, 2);
+
+ FN_ASSIGN(0, 3);
+ FN_ASSIGN(1, 3);
+ FN_ASSIGN(2, 3);
+ FN_ASSIGN(3, 3);
}
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index fa87eded61..c4f4db9c8e 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -4,20 +4,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_arm.h b/libavcodec/arm/videodsp_arm.h
index a7087599cc..112cbb86c7 100644
--- a/libavcodec/arm/videodsp_arm.h
+++ b/libavcodec/arm/videodsp_arm.h
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_armv5te.S b/libavcodec/arm/videodsp_armv5te.S
index 0510019f03..aff1161ada 100644
--- a/libavcodec/arm/videodsp_armv5te.S
+++ b/libavcodec/arm/videodsp_armv5te.S
@@ -2,20 +2,20 @@
@ ARMv5te-optimized core video DSP functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
-@ This file is part of Libav.
+@ This file is part of FFmpeg.
@
-@ Libav is free software; you can redistribute it and/or
+@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
-@ Libav is distributed in the hope that it will be useful,
+@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
-@ License along with Libav; if not, write to the Free Software
+@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
diff --git a/libavcodec/arm/videodsp_init_arm.c b/libavcodec/arm/videodsp_init_arm.c
index 20c6e4a605..a89abb25d5 100644
--- a/libavcodec/arm/videodsp_init_arm.c
+++ b/libavcodec/arm/videodsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/videodsp_init_armv5te.c b/libavcodec/arm/videodsp_init_armv5te.c
index 832191f6d2..1ea1f3438d 100644
--- a/libavcodec/arm/videodsp_init_armv5te.c
+++ b/libavcodec/arm/videodsp_init_armv5te.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -27,5 +27,7 @@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
{
+#if HAVE_ARMV5TE_EXTERNAL
ctx->prefetch = ff_prefetch_arm;
+#endif
}
diff --git a/libavcodec/arm/vorbisdsp_init_arm.c b/libavcodec/arm/vorbisdsp_init_arm.c
index 853ba2d865..f4b3d80ef6 100644
--- a/libavcodec/arm/vorbisdsp_init_arm.c
+++ b/libavcodec/arm/vorbisdsp_init_arm.c
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vorbisdsp_neon.S b/libavcodec/arm/vorbisdsp_neon.S
index 7df876c2bc..79ce54f938 100644
--- a/libavcodec/arm/vorbisdsp_neon.S
+++ b/libavcodec/arm/vorbisdsp_neon.S
@@ -2,20 +2,20 @@
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 1c914343d3..65ea53fe0f 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 58bd97d548..2942d488f5 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2009 David Conrad
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index 6bc9456336..feb1247916 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp6dsp_init_arm.c b/libavcodec/arm/vp6dsp_init_arm.c
index 7e2615047b..a59d61278c 100644
--- a/libavcodec/arm/vp6dsp_init_arm.c
+++ b/libavcodec/arm/vp6dsp_init_arm.c
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp6dsp_neon.S b/libavcodec/arm/vp6dsp_neon.S
index 10b4d0f14c..03dd28d1cb 100644
--- a/libavcodec/arm/vp6dsp_neon.S
+++ b/libavcodec/arm/vp6dsp_neon.S
@@ -1,20 +1,20 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
index 93b2788835..965342d93b 100644
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 3863dc31a5..e7d25a45c1 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2010 Mans Rullgard
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp.h b/libavcodec/arm/vp8dsp.h
index 0d55e0ffc0..7281d0bfb1 100644
--- a/libavcodec/arm/vp8dsp.h
+++ b/libavcodec/arm/vp8dsp.h
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_armv6.S b/libavcodec/arm/vp8dsp_armv6.S
index 9eb9734cd3..2320bf4d23 100644
--- a/libavcodec/arm/vp8dsp_armv6.S
+++ b/libavcodec/arm/vp8dsp_armv6.S
@@ -5,20 +5,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* This code was partially ported from libvpx, which uses this license:
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index aa77dbab98..8b801766d7 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_init_armv6.c b/libavcodec/arm/vp8dsp_init_armv6.c
index febe4e71a2..a5bcd733e0 100644
--- a/libavcodec/arm/vp8dsp_init_armv6.c
+++ b/libavcodec/arm/vp8dsp_init_armv6.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_init_neon.c b/libavcodec/arm/vp8dsp_init_neon.c
index 2b6c7750d3..53f1f23380 100644
--- a/libavcodec/arm/vp8dsp_init_neon.c
+++ b/libavcodec/arm/vp8dsp_init_neon.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index f43b4f7060..fcb424881b 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -4,20 +4,20 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavcodec/arm/vp9dsp_init.h b/libavcodec/arm/vp9dsp_init.h
new file mode 100644
index 0000000000..0dc1c2dc20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP9DSP_INIT_H
+#define AVCODEC_ARM_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_ARM_VP9DSP_INIT_H */
diff --git a/libavcodec/arm/vp9dsp_init_10bpp_arm.c b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
new file mode 100644
index 0000000000..b8cb293b20
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_10bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_12bpp_arm.c b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
new file mode 100644
index 0000000000..fa65eb260b
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_12bpp_arm.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_arm
+#include "vp9dsp_init_16bpp_arm_template.c"
diff --git a/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
new file mode 100644
index 0000000000..3620535065
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_16bpp_arm_template.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix) \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp) \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp) \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
+ temp + 3 * 2 * sz, 2 * sz, \
+ h, 0, my); \
+}
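As an illustration (not part of the patch itself), define_8tap_2d_fn(put, regular, 8, BPP) with BPP expanding to 10 produces roughly the following helper, which composes the 2D filter from the horizontal and vertical NEON passes declared below:

    static void put_regular8_hv_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
                                        const uint8_t *src, ptrdiff_t src_stride,
                                        int h, int mx, int my)
    {
        /* 24 rows of 8 pixels, 2 bytes per pixel */
        LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (8 < 64)) * 8 + 8) * 8 * 2]);
        /* horizontal pass into the temp buffer, starting 3 rows above dst */
        ff_vp9_put_regular8_h_10_neon(temp, 2 * 8, src - 3 * src_stride, src_stride,
                                      h + 8, mx, 0);
        /* vertical pass from the temp buffer, skipping the 3 rows of top padding */
        ff_vp9_put_regular8_v_10_neon(dst, dst_stride, temp + 3 * 2 * 8, 2 * 8,
                                      h, 0, my);
    }

Compared with the 8 bpp init further down, the temporary buffer and its stride are doubled because the samples here are 16 bits wide.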
+
+#define decl_filter_funcs(op, dir, sz, bpp) \
+ decl_mc_func(op, regular, dir, sz, bpp); \
+ decl_mc_func(op, sharp, dir, sz, bpp); \
+ decl_mc_func(op, smooth, dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp) \
+ decl_filter_funcs(put, h, sz, bpp); \
+ decl_filter_funcs(avg, h, sz, bpp); \
+ decl_filter_funcs(put, v, sz, bpp); \
+ decl_filter_funcs(avg, v, sz, bpp); \
+ decl_filter_funcs(put, hv, sz, bpp); \
+ decl_filter_funcs(avg, hv, sz, bpp)
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64, );
+declare_fpel(copy, 32, );
+declare_fpel(copy, 16, );
+declare_fpel(copy, 8, );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8, _16);
+declare_fpel(avg, 4, _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp) \
+ define_8tap_2d_fn(put, regular, sz, bpp) \
+ define_8tap_2d_fn(put, sharp, sz, bpp) \
+ define_8tap_2d_fn(put, smooth, sz, bpp) \
+ define_8tap_2d_fn(avg, regular, sz, bpp) \
+ define_8tap_2d_fn(avg, sharp, sz, bpp) \
+ define_8tap_2d_fn(avg, smooth, sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8, BPP)
+define_8tap_2d_funcs(4, BPP)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type, suffix) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon
+
+#define init_copy_avg(idx, sz1, sz2) \
+ init_fpel(idx, 0, sz2, copy, ); \
+ init_fpel(idx, 1, sz1, avg, _16)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
+ init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
+
+ init_copy_avg(0, 64, 128);
+ init_copy_avg(1, 32, 64);
+ init_copy_avg(2, 16, 32);
+ init_copy_avg(3, 8, 16);
+ init_copy_avg(4, 4, 8);
+
+ init_mc_funcs_dirs(0, 64, BPP);
+ init_mc_funcs_dirs(1, 32, BPP);
+ init_mc_funcs_dirs(2, 16, BPP);
+ init_mc_funcs_dirs(3, 8, BPP);
+ init_mc_funcs_dirs(4, 4, BPP);
+ }
+}
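The two size arguments to init_copy_avg reflect the naming of the full-pel functions: the bit-depth-agnostic copy routines are presumably named by byte width (a 64-pixel row of 16-bit samples is 128 bytes), while the averaging routines operate on 16-bit samples and take pixel widths, hence the _16 suffix shared between 10 and 12 bpp. init_copy_avg(0, 64, 128), for instance, ends up doing roughly:

    /* idx 0 == 64x64 blocks; SMOOTH, SHARP and BILINEAR get the same pointers */
    dsp->mc[0][FILTER_8TAP_REGULAR][0][0][0] = ff_vp9_copy128_neon;  /* put: 64 px * 2 bytes */
    dsp->mc[0][FILTER_8TAP_REGULAR][1][0][0] = ff_vp9_avg64_16_neon; /* avg: 64 16-bit px    */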
+
+#define define_itxfm2(type_a, type_b, sz, bpp) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
+
+#define define_itxfm_funcs(sz, bpp) \
+ define_itxfm(idct, idct, sz, bpp); \
+ define_itxfm(iadst, idct, sz, bpp); \
+ define_itxfm(idct, iadst, sz, bpp); \
+ define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4, BPP);
+define_itxfm_funcs(8, BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4, BPP);
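The extra define_itxfm2/define_itxfm level (and init_itxfm2/init_idct2 below) exists because a macro argument that appears next to ## is pasted literally without being expanded first; with a single-level macro, calls like define_itxfm(idct, idct, 32, BPP) would produce names containing the literal token BPP instead of 10 or 12. A minimal standalone illustration of the idiom:

    #define BPP 10
    #define NAME1(bpp) func_##bpp   /* NAME1(BPP) -> func_BPP: BPP is pasted unexpanded */
    #define NAME2(bpp) NAME1(bpp)   /* NAME2(BPP) -> func_10:  expanded, then pasted    */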
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+ init_itxfm(TX_4X4, 4x4, BPP);
+ init_itxfm(TX_8X8, 8x8, BPP);
+ init_itxfm(TX_16X16, 16x16, BPP);
+ init_idct(TX_32X32, idct_idct_32x32, BPP);
+ init_idct(4, iwht_iwht_4x4, BPP);
+ }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+ define_loop_filter(h, wd, size, bpp); \
+ define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4, 8, BPP);
+define_loop_filters(8, 8, BPP);
+define_loop_filters(16, 8, BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
+
+static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+ dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+ dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+ dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+ init_lpf_func_8(idx, 0, h, wd, bpp); \
+ init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp) \
+ init_lpf_func_16(0, h, bpp); \
+ init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+ init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
+ init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp) \
+ init_lpf_funcs_8_wd(0, 4, bpp); \
+ init_lpf_funcs_8_wd(1, 8, bpp); \
+ init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp) \
+ init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+ init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+ init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+ init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+ init_lpf_funcs_8(BPP);
+ init_lpf_funcs_16(BPP);
+ init_lpf_funcs_mix2(BPP);
+ }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+ vp9dsp_mc_init_arm(dsp);
+ vp9dsp_loopfilter_init_arm(dsp);
+ vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
new file mode 100644
index 0000000000..4c57fd6ba0
--- /dev/null
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz) \
+void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define declare_copy_avg(sz) \
+ declare_fpel(copy, sz); \
+ declare_fpel(avg , sz)
+
+#define decl_mc_func(op, filter, dir, sz) \
+void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz) \
+static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+ const uint8_t *src, ptrdiff_t src_stride, \
+ int h, int mx, int my) \
+{ \
+ LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
+ /* We only need h + 7 lines, but the horizontal filter assumes an \
+ * even number of rows, so filter h + 8 lines here. */ \
+ ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
+ src - 3 * src_stride, src_stride, \
+ h + 8, mx, 0); \
+ ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
+ temp + 3 * sz, sz, \
+ h, 0, my); \
+}
+
+#define decl_filter_funcs(op, dir, sz) \
+ decl_mc_func(op, regular, dir, sz); \
+ decl_mc_func(op, sharp, dir, sz); \
+ decl_mc_func(op, smooth, dir, sz)
+
+#define decl_mc_funcs(sz) \
+ decl_filter_funcs(put, h, sz); \
+ decl_filter_funcs(avg, h, sz); \
+ decl_filter_funcs(put, v, sz); \
+ decl_filter_funcs(avg, v, sz); \
+ decl_filter_funcs(put, hv, sz); \
+ decl_filter_funcs(avg, hv, sz)
+
+declare_copy_avg(64);
+declare_copy_avg(32);
+declare_copy_avg(16);
+declare_copy_avg(8);
+declare_copy_avg(4);
+
+decl_mc_funcs(64);
+decl_mc_funcs(32);
+decl_mc_funcs(16);
+decl_mc_funcs(8);
+decl_mc_funcs(4);
+
+#define define_8tap_2d_funcs(sz) \
+ define_8tap_2d_fn(put, regular, sz) \
+ define_8tap_2d_fn(put, sharp, sz) \
+ define_8tap_2d_fn(put, smooth, sz) \
+ define_8tap_2d_fn(avg, regular, sz) \
+ define_8tap_2d_fn(avg, sharp, sz) \
+ define_8tap_2d_fn(avg, smooth, sz)
+
+define_8tap_2d_funcs(64)
+define_8tap_2d_funcs(32)
+define_8tap_2d_funcs(16)
+define_8tap_2d_funcs(8)
+define_8tap_2d_funcs(4)
+
+
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type) \
+ dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
+ dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_neon
+
+#define init_copy_avg(idx, sz) \
+ init_fpel(idx, 0, sz, copy); \
+ init_fpel(idx, 1, sz, avg)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
+ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
+ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
+ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
+
+#define init_mc_funcs_dirs(idx, sz) \
+ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
+ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
+ init_mc_funcs(idx, hv, 1, 1, sz,)
+
+ init_copy_avg(0, 64);
+ init_copy_avg(1, 32);
+ init_copy_avg(2, 16);
+ init_copy_avg(3, 8);
+ init_copy_avg(4, 4);
+
+ init_mc_funcs_dirs(0, 64);
+ init_mc_funcs_dirs(1, 32);
+ init_mc_funcs_dirs(2, 16);
+ init_mc_funcs_dirs(3, 8);
+ init_mc_funcs_dirs(4, 4);
+ }
+}
+
+#define define_itxfm(type_a, type_b, sz) \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
+ ptrdiff_t stride, \
+ int16_t *_block, int eob)
+
+#define define_itxfm_funcs(sz) \
+ define_itxfm(idct, idct, sz); \
+ define_itxfm(iadst, idct, sz); \
+ define_itxfm(idct, iadst, sz); \
+ define_itxfm(iadst, iadst, sz)
+
+define_itxfm_funcs(4);
+define_itxfm_funcs(8);
+define_itxfm_funcs(16);
+define_itxfm(idct, idct, 32);
+define_itxfm(iwht, iwht, 4);
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#define init_itxfm(tx, sz) \
+ dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
+
+#define init_idct(tx, nm) \
+ dsp->itxfm_add[tx][DCT_DCT] = \
+ dsp->itxfm_add[tx][ADST_DCT] = \
+ dsp->itxfm_add[tx][DCT_ADST] = \
+ dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
+
+ init_itxfm(TX_4X4, 4x4);
+ init_itxfm(TX_8X8, 8x8);
+ init_itxfm(TX_16X16, 16x16);
+ init_idct(TX_32X32, idct_idct_32x32);
+ init_idct(4, iwht_iwht_4x4);
+ }
+}
+
+#define define_loop_filter(dir, wd, size) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size) \
+ define_loop_filter(h, wd, size); \
+ define_loop_filter(v, wd, size)
+
+define_loop_filters(4, 8);
+define_loop_filters(8, 8);
+define_loop_filters(16, 8);
+define_loop_filters(16, 16);
+
+define_loop_filters(44, 16);
+
+#define lf_mix_fn(dir, wd1, wd2, stridea) \
+static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int E, int I, int H) \
+{ \
+ ff_vp9_loop_filter_##dir##_##wd1##_8_neon(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
+ ff_vp9_loop_filter_##dir##_##wd2##_8_neon(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
+}
+
+#define lf_mix_fns(wd1, wd2) \
+ lf_mix_fn(h, wd1, wd2, stride) \
+ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t))
+
+lf_mix_fns(4, 8)
+lf_mix_fns(8, 4)
+lf_mix_fns(8, 8)
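Each mix2 entry filters two adjacent 8-pixel edge segments that may use different filter widths; the E, I and H thresholds for the second segment are passed in the upper byte. Only the 44 combination has a dedicated NEON implementation here; the others are composed in C. For instance, lf_mix_fns(4, 8) generates, for the horizontal direction:

    static void loop_filter_h_48_16_neon(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H)
    {
        /* first half of the edge: width-4 filter, thresholds in the low bytes */
        ff_vp9_loop_filter_h_4_8_neon(dst, stride, E & 0xff, I & 0xff, H & 0xff);
        /* second half, 8 lines further on: width-8 filter, thresholds in the high bytes */
        ff_vp9_loop_filter_h_8_8_neon(dst + 8 * stride, stride, E >> 8, I >> 8, H >> 8);
    }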
+
+static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
+ dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
+ dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
+ dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
+ dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
+ dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
+
+ dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
+ dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
+
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
+ dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon;
+ dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon;
+ dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon;
+ dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_neon;
+ dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_neon;
+ dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_neon;
+ }
+}
+
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp)
+{
+ if (bpp == 10) {
+ ff_vp9dsp_init_10bpp_arm(dsp);
+ return;
+ } else if (bpp == 12) {
+ ff_vp9dsp_init_12bpp_arm(dsp);
+ return;
+ } else if (bpp != 8)
+ return;
+
+ vp9dsp_mc_init_arm(dsp);
+ vp9dsp_loopfilter_init_arm(dsp);
+ vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9itxfm_16bpp_neon.S b/libavcodec/arm/vp9itxfm_16bpp_neon.S
new file mode 100644
index 0000000000..b4f615ebb8
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+@ Do two 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose32_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vswp \r1, \r4 @ vtrn.64 \rq0, \rq2
+ vswp \r3, \r6 @ vtrn.64 \rq1, \rq3
+ vswp \r9, \r12 @ vtrn.64 \rq4, \rq6
+ vswp \r11, \r14 @ vtrn.64 \rq5, \rq7
+ vtrn.32 \rq0, \rq1
+ vtrn.32 \rq2, \rq3
+ vtrn.32 \rq4, \rq5
+ vtrn.32 \rq6, \rq7
+.endm
+
+@ Do eight 2x2 transposes.
+.macro transpose32_8x_2x2 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vtrn.32 \r0, \r1
+ vtrn.32 \r2, \r3
+ vtrn.32 \r4, \r5
+ vtrn.32 \r6, \r7
+ vtrn.32 \r8, \r9
+ vtrn.32 \r10, \r11
+ vtrn.32 \r12, \r13
+ vtrn.32 \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+ vadd.s32 \tmpd1, \in1, \in2
+ vsub.s32 \tmpd2, \in1, \in2
+.if \neg > 0
+ vneg.s32 \tmpd1, \tmpd1
+.endif
+ vmull.s32 \tmpq3, \tmpd1, d0[0]
+ vmull.s32 \tmpq4, \tmpd2, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+.endm
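In C terms the macro computes the following; in the idct paths d0[0] holds 11585, i.e. cos(pi/4) in Q14, loaded from the coefficient tables above, and the widening multiply plus vrshrn.s64 keep a 64-bit intermediate so the 32-bit inputs cannot overflow during the multiply:

    /* minimal C sketch of mbutterfly0 (illustrative only, neg == 0 case) */
    static inline void mbutterfly0_c(int32_t *out1, int32_t *out2,
                                     int32_t in1, int32_t in2, int32_t c)
    {
        *out1 = (int32_t)(((int64_t)(in1 + in2) * c + (1 << 13)) >> 14);
        *out2 = (int32_t)(((int64_t)(in1 - in2) * c + (1 << 13)) >> 14);
    }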
+
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s32 \tmpq3, \in1, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq3, #14
+.endm
+
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can do with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+ vadd.s32 \tmpq1, \in1, \in2
+ vsub.s32 \tmpq2, \in1, \in2
+ vmull.s32 \tmpq3, \tmpd11, d0[0]
+ vmull.s32 \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+ vmull.s32 \tmpq3, \tmpd21, d0[0]
+ vmull.s32 \tmpq4, \tmpd22, d0[0]
+ vrshrn.s64 \out3, \tmpq3, #14
+ vrshrn.s64 \out4, \tmpq4, #14
+.else
+ vmull.s32 \tmpq5, \tmpd21, d0[0]
+ vmull.s32 \tmpq6, \tmpd22, d0[0]
+ vrshrn.s64 \out1, \tmpq3, #14
+ vrshrn.s64 \out2, \tmpq4, #14
+ vrshrn.s64 \out3, \tmpq5, #14
+ vrshrn.s64 \out4, \tmpq6, #14
+.endif
+.endm
+
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2, neg=0
+ vmull.s32 \out1, \in1, \coef1
+ vmlsl.s32 \out1, \in2, \coef2
+.if \neg
+ vmov.s64 \out2, #0
+ vmlsl.s32 \out2, \in1, \coef2
+ vmlsl.s32 \out2, \in2, \coef1
+.else
+ vmull.s32 \out2, \in1, \coef2
+ vmlal.s32 \out2, \in2, \coef1
+.endif
+.endm
+
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+ vmull.s32 \out1, \in1, \coef1
+ vmull.s32 \out2, \in2, \coef1
+ vmull.s32 \out3, \in1, \coef2
+ vmull.s32 \out4, \in2, \coef2
+ vmlsl.s32 \out1, \in3, \coef2
+ vmlsl.s32 \out2, \in4, \coef2
+ vmlal.s32 \out3, \in3, \coef1
+ vmlal.s32 \out4, \in4, \coef1
+.endm
+
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+ mbutterfly_l \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2, \neg
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s32 \tmp1, \inout1, \coef1
+ vmull.s32 \tmp2, \inout1, \coef2
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmov.s64 \tmp1, #0
+ vmull.s32 \tmp2, \inout2, \coef1
+ vmlsl.s32 \tmp1, \inout2, \coef2
+ vrshrn.s64 \inout2, \tmp2, #14
+ vrshrn.s64 \inout1, \tmp1, #14
+.endm
+
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+ vrshrn.s64 \inout1, \tmp1, #14
+ vrshrn.s64 \inout2, \tmp2, #14
+ vrshrn.s64 \inout3, \tmp3, #14
+ vrshrn.s64 \inout4, \tmp4, #14
+.endm
+
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+ vadd.s32 \out1, \in1, \in2
+ vsub.s32 \out2, \in1, \in2
+.endm
+
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+ vsub.s32 \out1, \in1, \in2
+ vadd.s32 \out2, \in1, \in2
+.endm
+
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+ vadd.s64 \tmp1, \in1, \in2
+ vsub.s64 \tmp2, \in1, \in2
+ vrshrn.s64 \out1, \tmp1, #14
+ vrshrn.s64 \out2, \tmp2, #14
+.endm
+
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ vadd.s64 \tmp1, \in1, \in3
+ vadd.s64 \tmp2, \in2, \in4
+ vsub.s64 \tmp3, \in1, \in3
+ vsub.s64 \tmp4, \in2, \in4
+ vrshrn.s64 \out1, \tmp1, #14
+ vrshrn.s64 \out2, \tmp2, #14
+ vrshrn.s64 \out3, \tmp3, #14
+ vrshrn.s64 \out4, \tmp4, #14
+.endm
+
+
+.macro iwht4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vadd.i32 \c0, \c0, \c1
+ vsub.i32 q11, \c2, \c3
+ vsub.i32 q10, \c0, q11
+ vshr.s32 q10, q10, #1
+ vsub.i32 \c2, q10, \c1
+ vsub.i32 \c1, q10, \c3
+ vadd.i32 \c3, q11, \c2
+ vsub.i32 \c0, \c0, \c1
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ iwht4_10 \c0, \c1, \c2, \c3, \cd0, \cd1, \cd2, \cd3, \cd4, \cd5, \cd6, \cd7
+.endm
+
+@ c0 == cd0,cd1, c1 == cd2,cd3
+.macro idct4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmul.s32 q13, \c1, d1[1]
+ vmul.s32 q11, \c1, d1[0]
+ vadd.i32 q14, \c0, \c2
+ vsub.i32 q15, \c0, \c2
+ vmla.s32 q13, \c3, d1[0]
+ vmul.s32 q12, q14, d0[0]
+ vmul.s32 q10, q15, d0[0]
+ vmls.s32 q11, \c3, d1[1]
+ vrshr.s32 q13, q13, #14
+ vrshr.s32 q12, q12, #14
+ vrshr.s32 q10, q10, #14
+ vrshr.s32 q11, q11, #14
+ vadd.i32 \c0, q12, q13
+ vsub.i32 \c3, q12, q13
+ vadd.i32 \c1, q10, q11
+ vsub.i32 \c2, q10, q11
+.endm
+
+.macro idct4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmull.s32 q13, \cd2, d1[1]
+ vmull.s32 q15, \cd3, d1[1]
+ vmull.s32 q11, \cd2, d1[0]
+ vmull.s32 q3, \cd3, d1[0]
+ vadd.i32 q14, \c0, \c2
+ vsub.i32 q2, \c0, \c2
+ vmlal.s32 q13, \cd6, d1[0]
+ vmlal.s32 q15, \cd7, d1[0]
+ vmull.s32 q12, d28, d0[0]
+ vmull.s32 q14, d29, d0[0]
+ vmull.s32 q10, d4, d0[0]
+ vmull.s32 q8, d5, d0[0]
+ vmlsl.s32 q11, \cd6, d1[1]
+ vmlsl.s32 q3, \cd7, d1[1]
+ vrshrn.s64 d26, q13, #14
+ vrshrn.s64 d27, q15, #14
+ vrshrn.s64 d24, q12, #14
+ vrshrn.s64 d25, q14, #14
+ vrshrn.s64 d20, q10, #14
+ vrshrn.s64 d21, q8, #14
+ vrshrn.s64 d22, q11, #14
+ vrshrn.s64 d23, q3, #14
+ vadd.i32 \c0, q12, q13
+ vsub.i32 \c3, q12, q13
+ vadd.i32 \c1, q10, q11
+ vsub.i32 \c2, q10, q11
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmul.s32 q10, \c0, d2[0]
+ vmla.s32 q10, \c2, d2[1]
+ vmla.s32 q10, \c3, d3[0]
+ vmul.s32 q11, \c0, d3[0]
+ vmls.s32 q11, \c2, d2[0]
+ vsub.s32 \c0, \c0, \c2
+ vmls.s32 q11, \c3, d2[1]
+ vadd.s32 \c0, \c0, \c3
+ vmul.s32 q13, \c1, d3[1]
+ vmul.s32 q12, \c0, d3[1]
+ vadd.s32 q14, q10, q13
+ vadd.s32 q15, q11, q13
+ vrshr.s32 \c0, q14, #14
+ vadd.s32 q10, q10, q11
+ vrshr.s32 \c1, q15, #14
+ vsub.s32 q10, q10, q13
+ vrshr.s32 \c2, q12, #14
+ vrshr.s32 \c3, q10, #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3, cd0, cd1, cd2, cd3, cd4, cd5, cd6, cd7
+ vmull.s32 q10, \cd0, d2[0]
+ vmull.s32 q4, \cd1, d2[0]
+ vmlal.s32 q10, \cd4, d2[1]
+ vmlal.s32 q4, \cd5, d2[1]
+ vmlal.s32 q10, \cd6, d3[0]
+ vmlal.s32 q4, \cd7, d3[0]
+ vmull.s32 q11, \cd0, d3[0]
+ vmull.s32 q5, \cd1, d3[0]
+ vmlsl.s32 q11, \cd4, d2[0]
+ vmlsl.s32 q5, \cd5, d2[0]
+ vsub.s32 \c0, \c0, \c2
+ vmlsl.s32 q11, \cd6, d2[1]
+ vmlsl.s32 q5, \cd7, d2[1]
+ vadd.s32 \c0, \c0, \c3
+ vmull.s32 q13, \cd2, d3[1]
+ vmull.s32 q6, \cd3, d3[1]
+ vmull.s32 q12, \cd0, d3[1]
+ vmull.s32 q7, \cd1, d3[1]
+ vadd.s64 q14, q10, q13
+ vadd.s64 q2, q4, q6
+ vadd.s64 q15, q11, q13
+ vadd.s64 q3, q5, q6
+ vrshrn.s64 \cd1, q2, #14
+ vrshrn.s64 \cd0, q14, #14
+ vadd.s64 q10, q10, q11
+ vadd.s64 q4, q4, q5
+ vrshrn.s64 \cd3, q3, #14
+ vrshrn.s64 \cd2, q15, #14
+ vsub.s64 q10, q10, q13
+ vsub.s64 q4, q4, q6
+ vrshrn.s64 \cd4, q12, #14
+ vrshrn.s64 \cd5, q7, #14
+ vrshrn.s64 \cd6, q10, #14
+ vrshrn.s64 \cd7, q4, #14
+.endm
+
+@ The public functions in this file have the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
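+@ Note: for the 10 and 12 bpp versions in this file, dst points at 16 bit
+@ pixels (the uint8_t pointer is just the common dsp prototype), the stride
+@ is in bytes, and the output is clamped to (1 << bpp) - 1.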
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel r12, itxfm4_coeffs
+ vld1.16 {d0}, [r12,:64]
+ vmovl.s16 q0, d0
+.endif
+.ifc \txfm1,iadst
+ movrel r12, iadst4_coeffs
+ vld1.16 {d1}, [r12,:64]
+ vmovl.s16 q1, d1
+.endif
+.else
+ movrel r12, itxfm4_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ @ iadst4_12 needs q4-q7
+ vpush {q4-q7}
+.endif
+.endif
+
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
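+ @ dc = (in[0] * 11585 + (1 << 13)) >> 14, applied twice (once per
+ @ dimension); the shared tail below then adds (dc + 8) >> 4 to each
+ @ pixel and clamps the result.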
+ vld1.32 {d4[]}, [r2,:32]
+ vmull.s32 q2, d4, d0[0]
+ vrshrn.s64 d4, q2, #14
+ vmull.s32 q2, d4, d0[0]
+ vrshrn.s64 d4, q2, #14
+ vst1.32 {d30[0]}, [r2,:32]
+ vdup.32 q2, d4[0]
+ vmov q3, q2
+ vmov q8, q2
+ vmov q9, q2
+ b 2f
+.endif
+
+1:
+ vld1.32 {q2-q3}, [r2,:128]
+ vst1.32 {q14-q15}, [r2,:128]!
+ vld1.32 {q8-q9}, [r2,:128]
+
+.ifc \txfm1,iwht
+ vshr.s32 q2, q2, #2
+ vshr.s32 q3, q3, #2
+ vshr.s32 q8, q8, #2
+ vshr.s32 q9, q9, #2
+.endif
+
+ vst1.16 {q14-q15}, [r2,:128]!
+ \txfm1\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19
+
+ @ Transpose 4x4 with 32 bit elements
+ vtrn.32 q2, q3
+ vtrn.32 q8, q9
+ vswp d5, d16
+ vswp d7, d18
+
+ \txfm2\()4_\bpp q2, q3, q8, q9, d4, d5, d6, d7, d16, d17, d18, d19
+2:
+ vmvn.u16 q15, #((0xffff << \bpp) & 0xffff)
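+ @ q15 = (1 << bpp) - 1, i.e. the maximum pixel value, used for clamping below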
+ vld1.16 {d0}, [r0,:64], r1
+ vld1.16 {d1}, [r0,:64], r1
+.ifnc \txfm1,iwht
+ vrshr.s32 q2, q2, #4
+ vrshr.s32 q3, q3, #4
+ vrshr.s32 q8, q8, #4
+ vrshr.s32 q9, q9, #4
+.endif
+ vaddw.u16 q2, q2, d0
+ vaddw.u16 q3, q3, d1
+ vld1.16 {d2}, [r0,:64], r1
+ vld1.16 {d3}, [r0,:64], r1
+ vqmovun.s32 d0, q2
+ vqmovun.s32 d1, q3
+ sub r0, r0, r1, lsl #2
+
+ vaddw.u16 q8, q8, d2
+ vmin.u16 q0, q0, q15
+ vaddw.u16 q9, q9, d3
+ vst1.16 {d0}, [r0,:64], r1
+ vqmovun.s32 d2, q8
+ vqmovun.s32 d3, q9
+ vmin.u16 q1, q1, q15
+
+ vst1.16 {d1}, [r0,:64], r1
+ vst1.16 {d2}, [r0,:64], r1
+ vst1.16 {d3}, [r0,:64], r1
+
+.if \bpp > 10
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.endif
+.endif
+ bx lr
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct, idct, \bpp
+itxfm_func4x4 iadst, idct, \bpp
+itxfm_func4x4 idct, iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht, iwht, \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
+.macro idct8
+ dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+ dmbutterfly d20, d21, d28, d29, d1[0], d1[1], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
+ dmbutterfly d18, d19, d30, d31, d2[0], d2[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
+ dmbutterfly d26, d27, d22, d23, d3[0], d3[1], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a
+
+ butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3
+ butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2
+ butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a
+ butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a
+
+ butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7]
+
+ dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5
+
+ butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4]
+ butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6]
+ butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+ movrel r12, iadst8_coeffs
+ vld1.16 {q1}, [r12,:128]!
+ vmovl.s16 q0, d2
+ vmovl.s16 q1, d3
+
+ dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d0[1], d0[0] @ q4,q5 = t1a, q2,q3 = t0a
+ dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d2[1], d2[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+ dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4
+
+ dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5
+
+ dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d1[1], d1[0] @ q6,q7 = t3a, q4,q5 = t2a
+ dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[1], d3[0] @ q10,q13 = t7a, q8,q15 = t6a
+
+ dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6
+ dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+ vneg.s32 q15, q15 @ q15 = out[7]
+ butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2
+
+ dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d1[0], d1[1] @ q10,q11 = t5a, q5,q7 = t4a
+ dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d1[1], d1[0] @ q2,q3 = t6a, q13,q14 = t7a
+
+ dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7
+
+ dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+ vneg.s32 q11, q11 @ q11 = out[3]
+
+ dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
+ vneg.s32 q9, q9 @ q9 = out[1]
+
+ dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
+ vneg.s32 q13, q13 @ q13 = out[5]
+.endm
+
+function idct8x8_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #5
+ vdup.s16 q15, r8
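+ @ q15 = r8 = (1 << bpp) - 1 (the pixel maximum), set by the per-bpp entry points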
+
+ mov r3, r0
+ mov r12, #8
+1:
+ @ Loop to add the constant from q8 into all 8x8 outputs
+ subs r12, r12, #2
+ vld1.16 {q2}, [r0,:128], r1
+ vaddw.u16 q10, q8, d4
+ vld1.16 {q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d5
+ vaddw.u16 q12, q8, d6
+ vaddw.u16 q13, q8, d7
+ vqmovun.s32 d4, q10
+ vqmovun.s32 d5, q11
+ vqmovun.s32 d6, q12
+ vqmovun.s32 d7, q13
+ vmin.u16 q2, q2, q15
+ vst1.16 {q2}, [r3,:128], r1
+ vmin.u16 q3, q3, q15
+ vst1.16 {q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r8,pc}
+endfunc
+.ltorg
+
+.macro itxfm8_1d_funcs txfm
+@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
+@ transpose into a horizontal 8x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = slice offset
+@ r2 = src
+function \txfm\()8_1d_4x8_pass1_neon
+ mov r12, #32
+ vmov.s32 q2, #0
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+ vld1.32 {q\i}, [r2,:128]
+ vst1.32 {q2}, [r2,:128], r12
+.endr
+
+ \txfm\()8
+
+ @ Do two 4x4 transposes. Originally, q8-q15 contain the
+ @ 8 rows. Afterwards, q8-q11, q12-q15 contain the transposed
+ @ 4x4 blocks.
+ transpose32_q_2x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #4
+ beq 1f
+.irp i, 8, 12, 9, 13, 10, 14, 11, 15
+ vst1.32 {q\i}, [r0,:128]!
+.endr
+ bx lr
+1:
+ @ Special case: For the last input column (r1 == 4),
+ @ which would be stored as the last row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ last 4x4 block).
+.irp i, 12, 13, 14, 15
+ add r0, r0, #16
+ vst1.32 {q\i}, [r0,:128]!
+.endr
+ vmov q12, q8
+ vmov q13, q9
+ vmov q14, q10
+ vmov q15, q11
+ bx lr
+endfunc
+
+@ Read a vertical 4x8 slice out of an 8x8 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x8 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
+function \txfm\()8_1d_4x8_pass2_neon
+ mov r12, #32
+.irp i, 8, 9, 10, 11
+ vld1.32 {q\i}, [r2,:128], r12
+.endr
+ cmp r3, #0
+ beq 1f
+.irp i, 12, 13, 14, 15
+ vld1.32 {q\i}, [r2,:128], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ \txfm\()8
+
+ vdup.s16 q4, r8
+.macro load_add_store coef0, coef1, coef2, coef3
+ vld1.16 {d4}, [r0,:64], r1
+ vld1.16 {d5}, [r3,:64], r1
+ vld1.16 {d6}, [r0,:64], r1
+ vld1.16 {d7}, [r3,:64], r1
+
+ vrshr.s32 \coef0, \coef0, #5
+ vrshr.s32 \coef1, \coef1, #5
+ vrshr.s32 \coef2, \coef2, #5
+ vrshr.s32 \coef3, \coef3, #5
+
+ vaddw.u16 \coef0, \coef0, d4
+ vaddw.u16 \coef1, \coef1, d5
+ vaddw.u16 \coef2, \coef2, d6
+ vaddw.u16 \coef3, \coef3, d7
+
+ sub r0, r0, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ vqmovun.s32 d4, \coef0
+ vqmovun.s32 d5, \coef1
+ vqmovun.s32 d6, \coef2
+ vqmovun.s32 d7, \coef3
+
+ vmin.u16 q2, q2, q4
+ vmin.u16 q3, q3, q4
+
+ vst1.16 {d4}, [r0,:64], r1
+ vst1.16 {d5}, [r3,:64], r1
+ vst1.16 {d6}, [r0,:64], r1
+ vst1.16 {d7}, [r3,:64], r1
+.endm
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+.purgem load_add_store
+
+ bx lr
+endfunc
+.endm
+
+itxfm8_1d_funcs idct
+itxfm8_1d_funcs iadst
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct8x8_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.else
+ vpush {q4-q5}
+.endif
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #256
+ sub sp, sp, r7
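+ @ sp is now 16 byte aligned, with at least 256 bytes reserved; r7 holds
+ @ the total adjustment so the stack can be restored at the end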
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+
+.irp i, 0, 4
+ add r0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 4
+ cmp r3, #12
+ ble 1f
+.endif
+.endif
+ mov r1, #\i
+ add r2, r6, #(\i*4)
+ bl \txfm1\()8_1d_4x8_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ @ For all-zero slices in pass 1, set q12-q15 to zero (for the in-register
+ @ passthrough of coefficients to pass 2) and clear the end of the temp buffer
+ vmov.i32 q12, #0
+ vmov.i32 q13, #0
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+.rept 4
+ vst1.32 {q12-q13}, [r0,:128]!
+.endr
+3:
+.endif
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.irp i, 0, 4
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ mov r3, #\i
+ bl \txfm2\()8_1d_4x8_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.else
+ vpop {q4-q5}
+.endif
+ pop {r4-r8,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+ push {r4-r8,lr}
+ movw r8, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+ push {r4-r8,lr}
+ movw r8, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+function idct16x16_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #6
+ vdup.s16 q15, r9
+
+ mov r3, r0
+ mov r12, #16
+1:
+ @ Loop to add the constant from q8 into all 16x16 outputs
+ subs r12, r12, #2
+ vld1.16 {q0-q1}, [r0,:128], r1
+ vaddw.u16 q9, q8, d0
+ vaddw.u16 q10, q8, d1
+ vld1.16 {q2-q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d2
+ vaddw.u16 q12, q8, d3
+ vaddw.u16 q13, q8, d4
+ vaddw.u16 q14, q8, d5
+ vqmovun.s32 d0, q9
+ vaddw.u16 q9, q8, d6
+ vqmovun.s32 d1, q10
+ vaddw.u16 q10, q8, d7
+ vqmovun.s32 d2, q11
+ vqmovun.s32 d3, q12
+ vqmovun.s32 d4, q13
+ vqmovun.s32 d5, q14
+ vmin.u16 q0, q0, q15
+ vmin.u16 q1, q1, q15
+ vqmovun.s32 d6, q9
+ vqmovun.s32 d7, q10
+ vst1.16 {q0-q1}, [r3,:128], r1
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+ vst1.16 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r9,pc}
+endfunc
+.ltorg
+
+.macro idct16_end
+ butterfly d18, d11, d8, d11 @ d18 = t0a, d11 = t7a
+ butterfly d19, d22, d9, d22 @ d19 = t1a, d22 = t6
+ butterfly d8, d26, d20, d26 @ d8 = t2a, d26 = t5
+ butterfly d9, d10, d28, d10 @ d9 = t3a, d10 = t4
+ butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a
+ butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10
+ butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13
+ butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a
+
+ mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+ mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11
+
+ vswp d27, d29 @ d27 = t12, d29 = t13a
+ vswp d28, d27 @ d28 = t12, d27 = t11
+ butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15]
+ butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14]
+ butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6]
+ butterfly d23, d24, d11, d20 @ d23 = out[7], d24 = out[8]
+ butterfly d18, d29, d8, d29 @ d18 = out[2], d29 = out[13]
+ butterfly d19, d28, d9, d28 @ d19 = out[3], d28 = out[12]
+ vmov d8, d21 @ d8 = t10a
+ butterfly d20, d27, d10, d27 @ d20 = out[4], d27 = out[11]
+ butterfly d21, d26, d26, d8 @ d21 = out[5], d26 = out[10]
+ bx lr
+.endm
+
+function idct16
+ mbutterfly0 d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
+ mbutterfly d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
+ mbutterfly d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
+ mbutterfly d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a
+ mbutterfly d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a
+ mbutterfly d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a
+ mbutterfly d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a
+ mbutterfly d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a
+
+ butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3
+ butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2
+ butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5
+ butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ mbutterfly0_h d16, d24, d16, d24, d8, d10, q4, q5 @ d16 = t0a, d24 = t1a
+ mbutterfly_h1 d20, d28, d1[0], d1[1], q4, q5 @ d20 = t2a, d28 = t3a
+ mbutterfly_h1 d18, d30, d2[0], d2[1], q4, q5 @ d18 = t4a, d30 = t7a
+ mbutterfly_h2 d26, d22, d3[0], d3[1], q4, q5 @ d26 = t5a, d22 = t6a
+ mbutterfly_h1 d17, d31, d4[0], d4[1], q4, q5 @ d17 = t8a, d31 = t15a
+ mbutterfly_h2 d25, d23, d5[0], d5[1], q4, q5 @ d25 = t9a, d23 = t14a
+ mbutterfly_h1 d21, d27, d6[0], d6[1], q4, q5 @ d21 = t10a, d27 = t13a
+ mbutterfly_h2 d29, d19, d7[0], d7[1], q4, q5 @ d29 = t11a, d19 = t12a
+
+ butterfly d8, d28, d16, d28 @ d8 = t0, d28 = t3
+ butterfly d9, d20, d24, d20 @ d9 = t1, d20 = t2
+ butterfly d10, d26, d18, d26 @ d10 = t4, d26 = t5
+ butterfly d11, d22, d30, d22 @ d11 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d1[0], d1[1], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d1[0], d1[1], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ vmov.s64 q12, #0
+ vmull.s32 q4, d17, d4[0]
+ vmull.s32 q5, d18, d2[1]
+ vmull.s32 q15, d18, d2[0]
+ vmlsl.s32 q12, d19, d7[1]
+ vmull.s32 q14, d17, d4[1]
+ vmull.s32 q13, d19, d7[0]
+ vmull.s32 q11, d16, d0[0]
+ vrshrn.s64 d16, q4, #14
+ vrshrn.s64 d11, q5, #14
+ vrshrn.s64 d10, q15, #14
+ vrshrn.s64 d24, q12, #14
+ vrshrn.s64 d29, q14, #14
+ vrshrn.s64 d17, q13, #14
+ vrshrn.s64 d28, q11, #14
+
+ mbutterfly_l q10, q11, d17, d24, d1[0], d1[1], neg=1
+ mbutterfly_l q9, q15, d29, d16, d1[0], d1[1]
+ vrshrn.s64 d27, q10, #14
+ vrshrn.s64 d21, q11, #14
+ vrshrn.s64 d23, q9, #14
+ vrshrn.s64 d25, q15, #14
+ vmov d8, d28
+ vmov d9, d28
+ mbutterfly0 d22, d26, d11, d10, d18, d30, q9, q15
+ vmov d20, d28
+ idct16_end
+endfunc
+
+function iadst16
+ movrel r12, iadst16_coeffs
+ vld1.16 {q0}, [r12,:128]!
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
+ mbutterfly_l q5, q4, d23, d24, d2[1], d2[0] @ q5 = t9, q4 = t8
+ butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
+ mbutterfly_l q7, q6, d29, d18, d1[1], d1[0] @ q7 = t3, q6 = t2
+ butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
+ mbutterfly_l q3, q2, d21, d26, d3[1], d3[0] @ q3 = t11, q2 = t10
+
+ vld1.16 {q0}, [r12,:128]!
+ butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ mbutterfly_l q5, q4, d27, d20, d0[1], d0[0] @ q5 = t5, q4 = t4
+ butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
+
+ mbutterfly_l q7, q6, d19, d28, d2[1], d2[0] @ q7 = t13, q6 = t12
+ butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
+ mbutterfly_l q3, q2, d25, d22, d1[1], d1[0] @ q3 = t7, q2 = t6
+ butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
+
+ mbutterfly_l q5, q4, d17, d30, d3[1], d3[0] @ q5 = t15, q4 = t14
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a
+ mbutterfly_l q7, q6, d23, d24, d2[0], d2[1] @ q7 = t9, q6 = t8
+ butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a
+
+ mbutterfly_l q2, q3, d28, d19, d2[1], d2[0] @ q2 = t12, q3 = t13
+ butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a
+ mbutterfly_l q5, q4, d21, d26, d3[0], d3[1] @ q5 = t11, q4 = t10
+ butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0
+ butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a
+
+ mbutterfly_l q6, q7, d30, d17, d3[1], d3[0] @ q6 = t14, q7 = t15
+ butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1
+ butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a
+ butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a
+
+ butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2
+ butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3
+
+ mbutterfly_l q5, q4, d19, d28, d1[0], d1[1] @ q5 = t13, q4 = t12
+ mbutterfly_l q6, q7, d30, d17, d1[1], d1[0] @ q6 = t14, q7 = t15
+
+ butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a
+ butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a
+ vneg.s32 d29, d29 @ d29 = out[13]
+
+ mbutterfly_l q5, q4, d4, d5, d1[0], d1[1] @ q5 = t5a, q4 = t4a
+ mbutterfly_l q6, q7, d7, d6, d1[1], d1[0] @ q6 = t6a, q7 = t7a
+
+ butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a
+ butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10
+
+ butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6
+ vneg.s32 d19, d19 @ d19 = out[3]
+ butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7
+
+ butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = t3a
+ butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = t11
+
+ mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
+ mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11]
+ mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9]
+ mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]
+
+ vneg.s32 d31, d5 @ d31 = out[15]
+ vneg.s32 d17, d3 @ d17 = out[1]
+
+ vmov d16, d2
+ vmov d30, d4
+ bx lr
+endfunc
+
+.macro itxfm16_1d_funcs txfm, suffix
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x2 slice and store.
+@ r0 = dst (temp buffer)
+@ r2 = src
+function \txfm\()16_1d_2x16_pass1\suffix\()_neon
+ push {lr}
+
+ mov r12, #64
+ vmov.s32 q4, #0
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl \txfm\()16\suffix
+
+ @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+ @ transposed 2x2 blocks.
+ transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 2x2 blocks horizontally.
+.irp i, 16, 18, 20, 22, 24, 26, 28, 30, 17, 19, 21, 23, 25, 27, 29, 31
+ vst1.32 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+endfunc
+
+@ Read a vertical 2x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 2x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function \txfm\()16_1d_2x16_pass2\suffix\()_neon
+ push {lr}
+
+ mov r12, #64
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19, 20
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+.endif
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl \txfm\()16\suffix
+
+.macro load_add_store coef0, coef1, coef2, coef3
+ vrshr.s32 \coef0, \coef0, #6
+ vrshr.s32 \coef1, \coef1, #6
+
+ vld1.32 {d8[]}, [r0,:32], r1
+ vld1.32 {d8[1]}, [r3,:32], r1
+ vrshr.s32 \coef2, \coef2, #6
+ vrshr.s32 \coef3, \coef3, #6
+ vld1.32 {d9[]}, [r0,:32], r1
+ vld1.32 {d9[1]}, [r3,:32], r1
+ vaddw.u16 \coef0, \coef0, d8
+ vld1.32 {d10[]}, [r0,:32], r1
+ vld1.32 {d10[1]}, [r3,:32], r1
+ vaddw.u16 \coef1, \coef1, d9
+ vld1.32 {d11[]}, [r0,:32], r1
+ vld1.32 {d11[1]}, [r3,:32], r1
+
+ vqmovun.s32 d8, \coef0
+ vdup.s16 q8, r9
+ vqmovun.s32 d9, \coef1
+ sub r0, r0, r1, lsl #2
+ sub r3, r3, r1, lsl #2
+ vaddw.u16 \coef2, \coef2, d10
+ vaddw.u16 \coef3, \coef3, d11
+ vmin.u16 q4, q4, q8
+ vst1.32 {d8[0]}, [r0,:32], r1
+ vst1.32 {d8[1]}, [r3,:32], r1
+ vqmovun.s32 d10, \coef2
+ vst1.32 {d9[0]}, [r0,:32], r1
+ vst1.32 {d9[1]}, [r3,:32], r1
+ vqmovun.s32 d11, \coef3
+ vmin.u16 q5, q5, q8
+
+ vst1.32 {d10[0]}, [r0,:32], r1
+ vst1.32 {d10[1]}, [r3,:32], r1
+ vst1.32 {d11[0]}, [r0,:32], r1
+ vst1.32 {d11[1]}, [r3,:32], r1
+.endm
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+.purgem load_add_store
+
+ pop {pc}
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+itxfm16_1d_funcs idct, _quarter
+itxfm16_1d_funcs idct, _half
+.ltorg
+
+@ This is the minimum eob value for each subpartition, in increments of 2
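+@ If the eob is at most the value listed for a given 2 column slice, that
+@ slice and all following ones only contain zero coefficients, so pass 1 is
+@ skipped for them and their part of the temp buffer is simply cleared.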
+const min_eob_idct_idct_16, align=4
+ .short 0, 3, 10, 22, 38, 62, 89, 121
+endconst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct16x16_dc_add_neon
+.endif
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.else
+ vpush {q4-q5}
+.endif
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #1024
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #10
+ ble idct16x16_quarter_add_16_neon
+ cmp r3, #38
+ ble idct16x16_half_add_16_neon
+
+ movrel r8, min_eob_idct_idct_16 + 2
+.endif
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, sp, #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(16 - \i)/2
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl \txfm1\()16_1d_2x16_pass1_neon
+.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+2:
+ subs r1, r1, #1
+ @ Unroll for 2 lines
+.rept 2
+ @ Fill one line with zeros
+ vst1.32 {q14-q15}, [r0,:128]!
+ vst1.32 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.endif
+
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+.endif
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl \txfm2\()16_1d_2x16_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.else
+ vpop {q4-q5}
+.endif
+ pop {r4-r9,pc}
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x03ff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x0fff
+ b vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+.ltorg
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+.irp i, 0, 2
+ add r0, sp, #(\i*64)
+.ifc \size,quarter
+.if \i == 2
+ cmp r3, #3
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl idct16_1d_2x16_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 4, 6
+ add r0, sp, #(\i*64)
+.if \i == 6
+ cmp r3, #22
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct16_1d_2x16_pass1_\size\()_neon
+.endr
+.endif
+
+ b 3f
+1:
+ vmov.i32 q14, #0
+ vmov.i32 q15, #0
+
+ @ Unroll for 2 lines
+.rept 2
+ @ Fill one line with zeros
+ vst1.32 {q14-q15}, [r0,:128]!
+ vst1.32 {q14-q15}, [r0,:128]!
+.endr
+
+3:
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct16_1d_2x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q5}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i32 q2, #0
+ vmovl.s16 q0, d0
+
+ vld1.32 {d16[]}, [r2,:32]
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vmull.s32 q8, d16, d0[0]
+ vrshrn.s64 d16, q8, #14
+ vdup.32 q8, d16[0]
+ vst1.32 {d4[0]}, [r2,:32]
+
+ vrshr.s32 q8, q8, #6
+ vdup.s16 q15, r9
+
+ mov r3, r0
+ mov r12, #32
+ sub r1, r1, #32
+1:
+ @ Loop to add the constant from q8 into all 32x32 outputs
+ subs r12, r12, #1
+ vld1.16 {q0-q1}, [r0,:128]!
+ vaddw.u16 q9, q8, d0
+ vaddw.u16 q10, q8, d1
+ vld1.16 {q2-q3}, [r0,:128], r1
+ vaddw.u16 q11, q8, d2
+ vaddw.u16 q12, q8, d3
+ vaddw.u16 q13, q8, d4
+ vaddw.u16 q14, q8, d5
+ vqmovun.s32 d0, q9
+ vaddw.u16 q9, q8, d6
+ vqmovun.s32 d1, q10
+ vaddw.u16 q10, q8, d7
+ vqmovun.s32 d2, q11
+ vqmovun.s32 d3, q12
+ vqmovun.s32 d4, q13
+ vqmovun.s32 d5, q14
+ vmin.u16 q0, q0, q15
+ vmin.u16 q1, q1, q15
+ vqmovun.s32 d6, q9
+ vqmovun.s32 d7, q10
+ vst1.16 {q0-q1}, [r3,:128]!
+ vmin.u16 q2, q2, q15
+ vmin.u16 q3, q3, q15
+ vst1.16 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ pop {r4-r9,pc}
+endfunc
+
+.macro idct32_end
+ butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a
+ butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18
+ butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
+ butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21
+ butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a
+ butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26
+ butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
+ butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29
+
+ mbutterfly d27, d20, d1[0], d1[1], q12, q15 @ d27 = t18a, d20 = t29a
+ mbutterfly d29, d9, d1[0], d1[1], q12, q15 @ d29 = t19, d9 = t28
+ mbutterfly d28, d10, d1[0], d1[1], q12, q15, neg=1 @ d28 = t27, d10 = t20
+ mbutterfly d26, d21, d1[0], d1[1], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
+
+ butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24
+ butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+ butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16
+ butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+ butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21
+ butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a
+ butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26
+ butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+ vmov d29, d8 @ d29 = t29
+
+ mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20
+ mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+ mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
+ mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
+ bx lr
+.endm
+
+function idct32_odd
+ movrel r12, idct_coeffs
+
+ @ Overwrite the idct16 coeffs with the stored ones for idct32
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ mbutterfly d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+ @ Reload the idct16 coefficients. We could swap the coefficients between
+ @ q0-q3 and q6-q7 by narrowing/lengthening, but that's slower than just
+ @ loading and lengthening.
+ vld1.16 {q0-q1}, [r12,:128]
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ movrel r12, idct_coeffs
+
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ mbutterfly_h1 d16, d31, d0[0], d0[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly_h2 d24, d23, d1[0], d1[1], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly_h1 d20, d27, d2[0], d2[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly_h2 d28, d19, d3[0], d3[1], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly_h1 d18, d29, d4[0], d4[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly_h2 d26, d21, d5[0], d5[1], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly_h1 d22, d25, d6[0], d6[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly_h2 d30, d17, d7[0], d7[1], q4, q5 @ d30 = t23a, d17 = t24a
+
+ vld1.16 {q0-q1}, [r12,:128]
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d2[0], d2[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d2[0], d2[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d3[0], d3[1], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d3[0], d3[1], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ movrel r12, idct_coeffs
+
+ vmovl.s16 q0, d12
+ vmovl.s16 q1, d13
+ vmovl.s16 q2, d14
+ vmovl.s16 q3, d15
+
+ vmov.s64 q14, #0
+ vmov.s64 q5, #0
+
+ vmull.s32 q4, d16, d0[0]
+ vmlsl.s32 q14, d19, d3[1]
+ vmull.s32 q15, d16, d0[1]
+ vmull.s32 q11, d17, d7[0]
+ vmlsl.s32 q5, d17, d7[1]
+ vmull.s32 q13, d19, d3[0]
+ vmull.s32 q10, d18, d4[0]
+ vmull.s32 q12, d18, d4[1]
+
+ vld1.16 {q0-q1}, [r12,:128]
+
+ vrshrn.s64 d8, q4, #14
+ vrshrn.s64 d9, q14, #14
+ vrshrn.s64 d29, q15, #14
+ vrshrn.s64 d28, q11, #14
+
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ vrshrn.s64 d11, q5, #14
+ vrshrn.s64 d31, q13, #14
+ vrshrn.s64 d10, q10, #14
+ vrshrn.s64 d30, q12, #14
+
+ mbutterfly_l q8, q9, d29, d8, d2[0], d2[1]
+ mbutterfly_l q13, q10, d31, d9, d2[0], d2[1], neg=1
+ vrshrn.s64 d23, q8, #14
+ vrshrn.s64 d24, q9, #14
+ vrshrn.s64 d27, q13, #14
+ vrshrn.s64 d20, q10, #14
+ mbutterfly_l q8, q9, d30, d10, d3[0], d3[1]
+ vrshrn.s64 d21, q8, #14
+ vrshrn.s64 d26, q9, #14
+ mbutterfly_l q8, q9, d28, d11, d3[0], d3[1], neg=1
+ vrshrn.s64 d25, q8, #14
+ vrshrn.s64 d22, q9, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+@ Do a 32-point IDCT of a 2x32 slice out of a 32x32 matrix.
+@ We don't have the register space to do a single-pass IDCT of 2x32,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs:
+@ a normal IDCT16 on every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added to/subtracted from the outputs of the first idct16.
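+@ In rough pseudocode: e = idct16 of the even inputs, o = a 16-point
+@ transform of the odd inputs, and then out[i] = e[i] + o[i] and
+@ out[31-i] = e[i] - o[i] for i = 0..15.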
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+function idct32_1d_2x32_pass1\suffix\()_neon
+ push {lr}
+
+ @ Double stride of the input, since we only read every other line
+ mov r12, #256
+ vmov.s32 d8, #0
+
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64]
+ vst1.32 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct16\suffix
+
+ @ Do eight 2x2 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d17, d18-d19 etc contain the eight
+ @ transposed 2x2 blocks.
+ transpose32_8x_2x2 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the registers a, b, c, d, e, f, g, h horizontally, followed
+ @ by the same registers h, g, f, e, d, c, b, a mirrored.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+ vst1.32 {d\i}, [r0,:64]!
+ vrev64.32 d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+ vst1.32 {d\i}, [r0,:64]!
+.endr
+.endm
+ store_rev 16, 18, 20, 22, 24, 26, 28, 30
+ store_rev 17, 19, 21, 23, 25, 27, 29, 31
+ sub r0, r0, #256
+.purgem store_rev
+
+ @ Move r2 back to the start of the input, and move
+ @ to the first odd row
+.ifb \suffix
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+ sub r2, r2, r12, lsl #3
+.endif
+ add r2, r2, #128
+
+ vmov.s32 d8, #0
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose32_8x_2x2 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+ @ Store the registers a, b, c, d, e, f, g, h horizontally,
+ @ adding into the output first, and then mirrored, subtracted
+ @ from the output.
+.macro store_rev a, b, c, d, e, f, g, h
+.irp i, \a, \b, \c, \d, \e, \f, \g, \h
+ vld1.32 {d8}, [r0,:64]
+ vadd.s32 d8, d8, d\i
+ vst1.32 {d8}, [r0,:64]!
+ vrev64.32 d\i, d\i
+.endr
+.irp i, \h, \g, \f, \e, \d, \c, \b, \a
+ vld1.32 {d8}, [r0,:64]
+ vsub.s32 d8, d8, d\i
+ vst1.32 {d8}, [r0,:64]!
+.endr
+.endm
+
+ store_rev 31, 29, 27, 25, 23, 21, 19, 17
+ store_rev 30, 28, 26, 24, 22, 20, 18, 16
+.purgem store_rev
+ pop {pc}
+endfunc
+.ltorg
+
+@ This is mostly the same as 2x32_pass1, but without the transpose;
+@ it uses the source as a temp buffer between the two idct passes and
+@ adds into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function idct32_1d_2x32_pass2\suffix\()_neon
+ push {lr}
+
+ mov r12, #256
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+
+ bl idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vst1.32 {d\i}, [r2,:64], r12
+.endr
+
+ sub r2, r2, r12, lsl #4
+ add r2, r2, #128
+
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.32 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+ sub r2, r2, #128
+
+ bl idct32_odd\suffix
+
+ @ Narrow the idct16 coefficients in q0-q3 into q0-q1, to
+ @ allow clobbering q2-q3 below.
+ vmovn.s32 d0, q0
+ vmovn.s32 d1, q1
+ vmovn.s32 d2, q2
+ vmovn.s32 d3, q3
+
+ mov r12, #256
+ vdup.s16 q4, r9
+.macro load_acc_store a, b, c, d, neg=0
+ vld1.32 {d4}, [r2,:64], r12
+ vld1.32 {d5}, [r2,:64], r12
+.if \neg == 0
+ vadd.s32 d4, d4, d\a
+ vld1.32 {d6}, [r2,:64], r12
+ vadd.s32 d5, d5, d\b
+ vld1.32 {d7}, [r2,:64], r12
+ vadd.s32 d6, d6, d\c
+ vadd.s32 d7, d7, d\d
+.else
+ vsub.s32 d4, d4, d\a
+ vld1.32 {d6}, [r2,:64], r12
+ vsub.s32 d5, d5, d\b
+ vld1.32 {d7}, [r2,:64], r12
+ vsub.s32 d6, d6, d\c
+ vsub.s32 d7, d7, d\d
+.endif
+ vld1.32 {d10[]}, [r0,:32], r1
+ vld1.32 {d10[1]}, [r0,:32], r1
+ vrshr.s32 q2, q2, #6
+ vld1.32 {d11[]}, [r0,:32], r1
+ vrshr.s32 q3, q3, #6
+ vld1.32 {d11[1]}, [r0,:32], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u16 q2, q2, d10
+ vaddw.u16 q3, q3, d11
+ vqmovun.s32 d4, q2
+ vqmovun.s32 d5, q3
+ vmin.u16 q2, q2, q4
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+.endm
+ load_acc_store 31, 30, 29, 28
+ load_acc_store 27, 26, 25, 24
+ load_acc_store 23, 22, 21, 20
+ load_acc_store 19, 18, 17, 16
+ sub r2, r2, r12
+ neg r12, r12
+ load_acc_store 16, 17, 18, 19, 1
+ load_acc_store 20, 21, 22, 23, 1
+ load_acc_store 24, 25, 26, 27, 1
+ load_acc_store 28, 29, 30, 31, 1
+.purgem load_acc_store
+ @ Lengthen the idct16 coeffs back into 32 bit form
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+ pop {pc}
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 3, 9, 21, 34, 51, 70, 98, 135, 176, 240, 258, 336, 357, 448, 472
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+ cmp r3, #1
+ beq idct32x32_dc_add_neon
+ vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32 + 2
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #4096
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]!
+ vld1.16 {q6-q7}, [r12,:128]
+ vmovl.s16 q2, d2
+ vmovl.s16 q3, d3
+ vmovl.s16 q1, d1
+ vmovl.s16 q0, d0
+
+ cmp r3, #34
+ ble idct32x32_quarter_add_16_neon
+ cmp r3, #135
+ ble idct32x32_half_add_16_neon
+
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, sp, #(\i*128)
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(32 - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 2
+ @ Fill one line with zeros
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct32_1d_2x32_pass2_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x03ff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+ push {r4-r9,lr}
+ movw r9, #0x0fff
+ b vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size, rows
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 2, 4, 6
+ add r0, sp, #(\i*128)
+.ifc \size,quarter
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(\rows - \i)/2
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.ifc \size,half
+ add r8, r8, #8
+.irp i, 8, 10, 12, 14
+ add r0, sp, #(\i*128)
+.if \i > 8
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(\rows - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*4)
+ bl idct32_1d_2x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 2
+ @ Fill one line with zeros
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ add r0, r4, #(\i*2)
+ mov r1, r5
+ add r2, sp, #(\i*4)
+ bl idct32_1d_2x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+idct32_partial quarter, 8
+idct32_partial half, 16
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
new file mode 100644
index 0000000000..6c09922cae
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -0,0 +1,1688 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+ .short 11585, 0, 6270, 15137
+iadst4_coeffs:
+ .short 5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+ .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+ .short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+ .short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+ .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+ .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+ .short 16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+ .short 14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
+
+@ Do four 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose16_q_4x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vtrn.32 \rq0, \rq1
+ vtrn.32 \rq2, \rq3
+ vtrn.32 \rq4, \rq5
+ vtrn.32 \rq6, \rq7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+ vtrn.16 \r8, \r9
+ vtrn.16 \r10, \r11
+ vtrn.16 \r12, \r13
+ vtrn.16 \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+ vadd.s16 \tmpd1, \in1, \in2
+ vsub.s16 \tmpd2, \in1, \in2
+ vmull.s16 \tmpq3, \tmpd1, d0[0]
+ vmull.s16 \tmpq4, \tmpd2, d0[0]
+.if \neg > 0
+ vneg.s32 \tmpq3, \tmpq3
+.endif
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+.endm
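+@ In practice d0[0] holds 11585 (cos(pi/4) scaled by 2^14), so per lane this
+@ is roughly out1 = ((in1 + in2) * 11585 + (1 << 13)) >> 14, and likewise
+@ for out2 with the difference.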
+
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s16 \tmpq3, \in1, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq3, #14
+.endm
+
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This can be used with either 4 or 6 temporary q registers.
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+ vadd.s16 \tmpq1, \in1, \in2
+ vsub.s16 \tmpq2, \in1, \in2
+ vmull.s16 \tmpq3, \tmpd11, d0[0]
+ vmull.s16 \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+ vmull.s16 \tmpq3, \tmpd21, d0[0]
+ vmull.s16 \tmpq4, \tmpd22, d0[0]
+ vrshrn.s32 \out3, \tmpq3, #14
+ vrshrn.s32 \out4, \tmpq4, #14
+.else
+ vmull.s16 \tmpq5, \tmpd21, d0[0]
+ vmull.s16 \tmpq6, \tmpd22, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vrshrn.s32 \out2, \tmpq4, #14
+ vrshrn.s32 \out3, \tmpq5, #14
+ vrshrn.s32 \out4, \tmpq6, #14
+.endif
+.endm
+
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
+ vmull.s16 \out1, \in1, \coef1
+ vmlsl.s16 \out1, \in2, \coef2
+ vmull.s16 \out2, \in1, \coef2
+ vmlal.s16 \out2, \in2, \coef1
+.endm
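+@ Viewed as a complex multiply, this rotates (in1 + i*in2) by
+@ (coef1 + i*coef2); the coefficient pairs fed to it are cosine/sine values
+@ scaled by 2^14.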
+
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+ vmull.s16 \out1, \in1, \coef1
+ vmull.s16 \out2, \in2, \coef1
+ vmull.s16 \out3, \in1, \coef2
+ vmull.s16 \out4, \in2, \coef2
+ vmlsl.s16 \out1, \in3, \coef2
+ vmlsl.s16 \out2, \in4, \coef2
+ vmlal.s16 \out3, \in3, \coef1
+ vmlal.s16 \out4, \in4, \coef1
+.endm
+
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+ mbutterfly_l \tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+ vneg.s32 \tmp2, \tmp2
+.endif
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s16 \tmp1, \inout1, \coef1
+ vmull.s16 \tmp2, \inout1, \coef2
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+ vmull.s16 \tmp1, \inout2, \coef2
+ vmull.s16 \tmp2, \inout2, \coef1
+ vneg.s32 \tmp1, \tmp1
+ vrshrn.s32 \inout2, \tmp2, #14
+ vrshrn.s32 \inout1, \tmp1, #14
+.endm
+
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+ vrshrn.s32 \inout1, \tmp1, #14
+ vrshrn.s32 \inout2, \tmp2, #14
+ vrshrn.s32 \inout3, \tmp3, #14
+ vrshrn.s32 \inout4, \tmp4, #14
+.endm
+
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+ vadd.s16 \out1, \in1, \in2
+ vsub.s16 \out2, \in1, \in2
+.endm
+
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+ vsub.s16 \out1, \in1, \in2
+ vadd.s16 \out2, \in1, \in2
+.endm
+
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+ vadd.s32 \tmp1, \in1, \in2
+ vsub.s32 \tmp2, \in1, \in2
+ vrshrn.s32 \out1, \tmp1, #14
+ vrshrn.s32 \out2, \tmp2, #14
+.endm
+
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+ vadd.s32 \tmp1, \in1, \in3
+ vadd.s32 \tmp2, \in2, \in4
+ vsub.s32 \tmp3, \in1, \in3
+ vsub.s32 \tmp4, \in2, \in4
+ vrshrn.s32 \out1, \tmp1, #14
+ vrshrn.s32 \out2, \tmp2, #14
+ vrshrn.s32 \out3, \tmp3, #14
+ vrshrn.s32 \out4, \tmp4, #14
+.endm
+
+
+.macro iwht4 c0, c1, c2, c3
+ vadd.i16 \c0, \c0, \c1
+ vsub.i16 d17, \c2, \c3
+ vsub.i16 d16, \c0, d17
+ vshr.s16 d16, d16, #1
+ vsub.i16 \c2, d16, \c1
+ vsub.i16 \c1, d16, \c3
+ vadd.i16 \c3, d17, \c2
+ vsub.i16 \c0, \c0, \c1
+.endm
+
+.macro idct4 c0, c1, c2, c3
+ vmull.s16 q13, \c1, d0[3]
+ vmull.s16 q11, \c1, d0[2]
+ vadd.i16 d16, \c0, \c2
+ vsub.i16 d17, \c0, \c2
+ vmlal.s16 q13, \c3, d0[2]
+ vmull.s16 q9, d16, d0[0]
+ vmull.s16 q10, d17, d0[0]
+ vmlsl.s16 q11, \c3, d0[3]
+ vrshrn.s32 d26, q13, #14
+ vrshrn.s32 d18, q9, #14
+ vrshrn.s32 d20, q10, #14
+ vrshrn.s32 d22, q11, #14
+ vadd.i16 \c0, d18, d26
+ vsub.i16 \c3, d18, d26
+ vadd.i16 \c1, d20, d22
+ vsub.i16 \c2, d20, d22
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+ vmull.s16 q10, \c0, d1[0]
+ vmlal.s16 q10, \c2, d1[1]
+ vmlal.s16 q10, \c3, d1[2]
+ vmull.s16 q11, \c0, d1[2]
+ vmlsl.s16 q11, \c2, d1[0]
+ vsub.s16 \c0, \c0, \c2
+ vmlsl.s16 q11, \c3, d1[1]
+ vadd.s16 \c0, \c0, \c3
+ vmull.s16 q13, \c1, d1[3]
+ vmull.s16 q12, \c0, d1[3]
+ vadd.s32 q14, q10, q13
+ vadd.s32 q1, q11, q13
+ vrshrn.s32 \c0, q14, #14
+ vadd.s32 q10, q10, q11
+ vrshrn.s32 \c1, q1, #14
+ vsub.s32 q10, q10, q13
+ vrshrn.s32 \c2, q12, #14
+ vrshrn.s32 \c3, q10, #14
+.endm
+
+@ The public functions in this file have the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
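+@ eob is one past the position of the last nonzero coefficient in scan
+@ order; eob == 1 thus means only the DC coefficient is set, which the
+@ idct_idct versions handle via a dedicated fast path.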
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+ movrel r12, itxfm4_coeffs
+ vld1.16 {d0}, [r12,:64]
+.endif
+.ifc \txfm1,iadst
+ movrel r12, iadst4_coeffs
+ vld1.16 {d1}, [r12,:64]
+.endif
+.else
+ movrel r12, itxfm4_coeffs
+ vld1.16 {q0}, [r12,:128]
+.endif
+
+ vmov.i16 q15, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
+ vld1.16 {d4[]}, [r2,:16]
+ vmull.s16 q2, d4, d0[0]
+ vrshrn.s32 d4, q2, #14
+ vmull.s16 q2, d4, d0[0]
+ vrshrn.s32 d4, q2, #14
+ vst1.16 {d30[0]}, [r2,:16]
+ vdup.16 q2, d4[0]
+ vmov q3, q2
+ b 2f
+.endif
+
+1:
+ vld1.16 {d4-d7}, [r2,:128]
+ vst1.16 {q15}, [r2,:128]!
+
+.ifc \txfm1,iwht
+ vshr.s16 q2, q2, #2
+ vshr.s16 q3, q3, #2
+.endif
+
+ \txfm1\()4 d4, d5, d6, d7
+
+ vst1.16 {q15}, [r2,:128]!
+ @ Transpose 4x4 with 16 bit elements
+ vtrn.16 d4, d5
+ vtrn.16 d6, d7
+ vtrn.32 q2, q3
+
+ \txfm2\()4 d4, d5, d6, d7
+2:
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d0[1]}, [r0,:32], r1
+.ifnc \txfm1,iwht
+ vrshr.s16 q2, q2, #4
+ vrshr.s16 q3, q3, #4
+.endif
+ vaddw.u8 q2, q2, d0
+ vld1.32 {d1[]}, [r0,:32], r1
+ vld1.32 {d1[1]}, [r0,:32], r1
+ vqmovun.s16 d0, q2
+ sub r0, r0, r1, lsl #2
+
+ vaddw.u8 q3, q3, d1
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vqmovun.s16 d1, q3
+
+ vst1.32 {d0[1]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ vst1.32 {d1[1]}, [r0,:32], r1
+
+ bx lr
+endfunc
+.endm
+
+itxfm_func4x4 idct, idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct, iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht, iwht
+
+
+.macro idct8
+ dmbutterfly0 d16, d17, d24, d25, q8, q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+ dmbutterfly d20, d21, d28, d29, d0[2], d0[3], q2, q3, q4, q5 @ q10 = t2a, q14 = t3a
+ dmbutterfly d18, d19, d30, d31, d1[0], d1[1], q2, q3, q4, q5 @ q9 = t4a, q15 = t7a
+ dmbutterfly d26, d27, d22, d23, d1[2], d1[3], q2, q3, q4, q5 @ q13 = t5a, q11 = t6a
+
+ butterfly q2, q14, q8, q14 @ q2 = t0, q14 = t3
+ butterfly q3, q10, q12, q10 @ q3 = t1, q10 = t2
+ butterfly q4, q13, q9, q13 @ q4 = t4, q13 = t5a
+ butterfly q5, q11, q15, q11 @ q5 = t7, q11 = t6a
+
+ butterfly q8, q15, q2, q5 @ q8 = out[0], q15 = out[7]
+
+ dmbutterfly0 d4, d5, d10, d11, q11, q13, q9, q13, d18, d19, d26, d27, q2, q5, q11, q12 @ q2 = t6, q5 = t5
+
+ butterfly q11, q12, q14, q4 @ q11 = out[3], q12 = out[4]
+ butterfly q9, q14, q3, q2 @ q9 = out[1], q14 = out[6]
+ butterfly_r q13, q10, q10, q5 @ q13 = out[5], q10 = out[2]
+.endm
+
+.macro iadst8
+ dmbutterfly_l q4, q5, q2, q3, d30, d31, d16, d17, d2[1], d2[0] @ q4,q5 = t1a, q2,q3 = t0a
+ dmbutterfly_l q8, q15, q6, q7, d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+ dbutterfly_n d22, d23, d4, d5, q2, q3, q6, q7, q11, q12, q2, q3 @ q11 = t0, q2 = t4
+
+ dbutterfly_n d24, d25, d6, d7, q4, q5, q8, q15, q12, q3, q6, q7 @ q12 = t1, q3 = t5
+
+ dmbutterfly_l q6, q7, q4, q5, d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
+ dmbutterfly_l q10, q13, q8, q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
+
+ dbutterfly_n d18, d19, d8, d9, q4, q5, q8, q15, q9, q14, q4, q5 @ q9 = t2, q4 = t6
+ dbutterfly_n d16, d17, d12, d13, q6, q7, q10, q13, q8, q15, q6, q7 @ q8 = t3, q6 = t7
+
+ butterfly q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+ vneg.s16 q15, q15 @ q15 = out[7]
+ butterfly q8, q9, q11, q9 @ q8 = out[0], q9 = t2
+
+ dmbutterfly_l q10, q11, q5, q7, d4, d5, d6, d7, d0[2], d0[3] @ q10,q11 = t5a, q5,q7 = t4a
+ dmbutterfly_l q2, q3, q13, q14, d12, d13, d8, d9, d0[3], d0[2] @ q2,q3 = t6a, q13,q14 = t7a
+
+ dbutterfly_n d28, d29, d8, d9, q10, q11, q13, q14, q4, q6, q10, q11 @ q14 = out[6], q4 = t7
+
+ dmbutterfly0 d22, d23, d24, d25, q9, q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+ vneg.s16 q11, q11 @ q11 = out[3]
+
+ dbutterfly_n d18, d19, d4, d5, q5, q7, q2, q3, q9, q10, q2, q3 @ q9 = -out[1], q2 = t6
+ vneg.s16 q9, q9 @ q9 = out[1]
+
+ dmbutterfly0 d20, d21, d26, d27, q2, q4, q3, q5, d6, d7, d10, d11, q6, q7 @ q10 = out[2], q13 = -out[5]
+ vneg.s16 q13, q13 @ q13 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+ @ Push q4-q7 if iadst is used; the idct needs
+ @ fewer scratch registers, so only push q4-q5
+ @ when only the idct is involved.
+ @ The iadst also uses a few coefficients from the
+ @ idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+ movrel r12, idct_coeffs
+ vpush {q4-q5}
+.else
+ movrel r12, iadst8_coeffs
+ vld1.16 {q1}, [r12,:128]!
+ vpush {q4-q7}
+.endif
+ vld1.16 {q0}, [r12,:128]
+
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ bne 1f
+ @ DC-only for idct/idct
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vmov q9, q8
+ vmov q10, q8
+ vmov q11, q8
+ vmov q12, q8
+ vmov q13, q8
+ vmov q14, q8
+ vmov q15, q8
+ vst1.16 {d4[0]}, [r2,:16]
+ b 2f
+.endif
+1:
+ vld1.16 {q8-q9}, [r2,:128]!
+ vld1.16 {q10-q11}, [r2,:128]!
+ vld1.16 {q12-q13}, [r2,:128]!
+ vld1.16 {q14-q15}, [r2,:128]!
+ sub r2, r2, #128
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+ vst1.16 {q2-q3}, [r2,:128]!
+
+ \txfm1\()8
+
+ @ Transpose 8x8 with 16 bit elements
+ vswp d17, d24
+ vswp d19, d26
+ vswp d21, d28
+ vswp d23, d30
+ transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
+
+ \txfm2\()8
+2:
+ mov r3, r0
+ @ Add into the destination
+ vld1.8 {d4}, [r0,:64], r1
+ vrshr.s16 q8, q8, #5
+ vld1.8 {d5}, [r0,:64], r1
+ vrshr.s16 q9, q9, #5
+ vld1.8 {d6}, [r0,:64], r1
+ vrshr.s16 q10, q10, #5
+ vaddw.u8 q8, q8, d4
+ vld1.8 {d7}, [r0,:64], r1
+ vrshr.s16 q11, q11, #5
+ vaddw.u8 q9, q9, d5
+ vld1.8 {d8}, [r0,:64], r1
+ vrshr.s16 q12, q12, #5
+ vaddw.u8 q10, q10, d6
+ vqmovun.s16 d4, q8
+ vld1.8 {d9}, [r0,:64], r1
+ vrshr.s16 q13, q13, #5
+ vaddw.u8 q11, q11, d7
+ vqmovun.s16 d5, q9
+ vld1.8 {d10}, [r0,:64], r1
+ vrshr.s16 q14, q14, #5
+ vaddw.u8 q12, q12, d8
+ vqmovun.s16 d6, q10
+ vld1.8 {d11}, [r0,:64], r1
+ vrshr.s16 q15, q15, #5
+ vaddw.u8 q13, q13, d9
+ vqmovun.s16 d7, q11
+
+
+ vst1.8 {d4}, [r3,:64], r1
+ vaddw.u8 q14, q14, d10
+ vst1.8 {d5}, [r3,:64], r1
+ vqmovun.s16 d8, q12
+ vst1.8 {d6}, [r3,:64], r1
+ vaddw.u8 q15, q15, d11
+ vst1.8 {d7}, [r3,:64], r1
+ vqmovun.s16 d9, q13
+ vst1.8 {d8}, [r3,:64], r1
+ vqmovun.s16 d10, q14
+ vst1.8 {d9}, [r3,:64], r1
+ vqmovun.s16 d11, q15
+
+ vst1.8 {d10}, [r3,:64], r1
+ vst1.8 {d11}, [r3,:64], r1
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q5}
+.else
+ vpop {q4-q7}
+.endif
+ bx lr
+endfunc
+.endm
+
+itxfm_func8x8 idct, idct
+itxfm_func8x8 iadst, idct
+.ltorg
+itxfm_func8x8 idct, iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i16 q2, #0
+
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vst1.16 {d4[0]}, [r2,:16]
+
+ vrshr.s16 q8, q8, #6
+
+ mov r3, r0
+ mov r12, #16
+1:
+ @ Loop to add the constant from q8 into all 16x16 outputs
+ subs r12, r12, #2
+ vld1.8 {q2}, [r0,:128], r1
+ vaddw.u8 q10, q8, d4
+ vld1.8 {q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d5
+ vaddw.u8 q12, q8, d6
+ vaddw.u8 q13, q8, d7
+ vqmovun.s16 d4, q10
+ vqmovun.s16 d5, q11
+ vqmovun.s16 d6, q12
+ vst1.8 {q2}, [r3,:128], r1
+ vqmovun.s16 d7, q13
+ vst1.8 {q3}, [r3,:128], r1
+ bne 1b
+
+ bx lr
+endfunc
+.ltorg
+
+.macro idct16_end
+ butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = t7a
+ butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = t6
+ butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = t5
+ butterfly d5, d6, d28, d6 @ d5 = t3a, d6 = t4
+ butterfly d20, d28, d16, d24 @ d20 = t8a, d28 = t11a
+ butterfly d24, d21, d23, d21 @ d24 = t9, d21 = t10
+ butterfly d23, d27, d25, d27 @ d23 = t14, d27 = t13
+ butterfly d25, d29, d29, d17 @ d25 = t15a, d29 = t12a
+
+ mbutterfly0 d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+ mbutterfly0 d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12, d28 = t11
+
+ vswp d27, d29 @ d27 = t12, d29 = t13a
+ vswp d28, d27 @ d28 = t12, d27 = t11
+ butterfly d16, d31, d18, d25 @ d16 = out[0], d31 = out[15]
+ butterfly d17, d30, d19, d23 @ d17 = out[1], d30 = out[14]
+ butterfly_r d25, d22, d22, d24 @ d25 = out[9], d22 = out[6]
+ butterfly d23, d24, d7, d20 @ d23 = out[7], d24 = out[8]
+ butterfly d18, d29, d4, d29 @ d18 = out[2], d29 = out[13]
+ butterfly d19, d28, d5, d28 @ d19 = out[3], d28 = out[12]
+ vmov d4, d21 @ d4 = t10a
+ butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11]
+ butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10]
+ bx lr
+.endm
+
+function idct16
+ mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
+ mbutterfly d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a
+ mbutterfly d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a
+ mbutterfly d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a
+ mbutterfly d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a
+ mbutterfly d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a
+ mbutterfly d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a
+ mbutterfly d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a
+
+ butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3
+ butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2
+ butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5
+ butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_half
+ mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
+ mbutterfly_h1 d20, d28, d0[2], d0[3], q2, q3 @ d20 = t2a, d28 = t3a
+ mbutterfly_h1 d18, d30, d1[0], d1[1], q2, q3 @ d18 = t4a, d30 = t7a
+ mbutterfly_h2 d26, d22, d1[2], d1[3], q2, q3 @ d26 = t5a, d22 = t6a
+ mbutterfly_h1 d17, d31, d2[0], d2[1], q2, q3 @ d17 = t8a, d31 = t15a
+ mbutterfly_h2 d25, d23, d2[2], d2[3], q2, q3 @ d25 = t9a, d23 = t14a
+ mbutterfly_h1 d21, d27, d3[0], d3[1], q2, q3 @ d21 = t10a, d27 = t13a
+ mbutterfly_h2 d29, d19, d3[2], d3[3], q2, q3 @ d29 = t11a, d19 = t12a
+
+ butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3
+ butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2
+ butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5
+ butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6
+ butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9
+ butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10
+ butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13
+ butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14
+
+ mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a
+ mbutterfly d23, d25, d0[2], d0[3], q9, q15 @ d23 = t9a, d25 = t14a
+ mbutterfly d27, d21, d0[2], d0[3], q9, q15, neg=1 @ d27 = t13a, d21 = t10a
+ idct16_end
+endfunc
+
+function idct16_quarter
+ vmull.s16 q12, d19, d3[3]
+ vmull.s16 q2, d17, d2[0]
+ vmull.s16 q3, d18, d1[1]
+ vmull.s16 q15, d18, d1[0]
+ vneg.s32 q12, q12
+ vmull.s16 q14, d17, d2[1]
+ vmull.s16 q13, d19, d3[2]
+ vmull.s16 q11, d16, d0[0]
+ vrshrn.s32 d24, q12, #14
+ vrshrn.s32 d16, q2, #14
+ vrshrn.s32 d7, q3, #14
+ vrshrn.s32 d6, q15, #14
+ vrshrn.s32 d29, q14, #14
+ vrshrn.s32 d17, q13, #14
+ vrshrn.s32 d28, q11, #14
+
+ mbutterfly_l q10, q11, d17, d24, d0[2], d0[3]
+ mbutterfly_l q9, q15, d29, d16, d0[2], d0[3]
+ vneg.s32 q11, q11
+ vrshrn.s32 d27, q10, #14
+ vrshrn.s32 d21, q11, #14
+ vrshrn.s32 d23, q9, #14
+ vrshrn.s32 d25, q15, #14
+ vmov d4, d28
+ vmov d5, d28
+ mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15
+ vmov d20, d28
+ idct16_end
+endfunc
+
+function iadst16
+ movrel r12, iadst16_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+
+ mbutterfly_l q3, q2, d31, d16, d0[1], d0[0] @ q3 = t1, q2 = t0
+ mbutterfly_l q5, q4, d23, d24, d1[1], d1[0] @ q5 = t9, q4 = t8
+ butterfly_n d31, d24, q3, q5, q6, q5 @ d31 = t1a, d24 = t9a
+ mbutterfly_l q7, q6, d29, d18, d0[3], d0[2] @ q7 = t3, q6 = t2
+ butterfly_n d16, d23, q2, q4, q3, q4 @ d16 = t0a, d23 = t8a
+
+ mbutterfly_l q3, q2, d21, d26, d1[3], d1[2] @ q3 = t11, q2 = t10
+ butterfly_n d29, d26, q7, q3, q4, q3 @ d29 = t3a, d26 = t11a
+ mbutterfly_l q5, q4, d27, d20, d2[1], d2[0] @ q5 = t5, q4 = t4
+ butterfly_n d18, d21, q6, q2, q3, q2 @ d18 = t2a, d21 = t10a
+
+ mbutterfly_l q7, q6, d19, d28, d3[1], d3[0] @ q7 = t13, q6 = t12
+ butterfly_n d20, d28, q5, q7, q2, q7 @ d20 = t5a, d28 = t13a
+ mbutterfly_l q3, q2, d25, d22, d2[3], d2[2] @ q3 = t7, q2 = t6
+ butterfly_n d27, d19, q4, q6, q5, q6 @ d27 = t4a, d19 = t12a
+
+ mbutterfly_l q5, q4, d17, d30, d3[3], d3[2] @ q5 = t15, q4 = t14
+ movrel r12, idct_coeffs
+ vld1.16 {q0}, [r12,:128]
+ butterfly_n d22, d30, q3, q5, q6, q5 @ d22 = t7a, d30 = t15a
+ mbutterfly_l q7, q6, d23, d24, d1[0], d1[1] @ q7 = t9, q6 = t8
+ butterfly_n d25, d17, q2, q4, q3, q4 @ d25 = t6a, d17 = t14a
+
+ mbutterfly_l q2, q3, d28, d19, d1[1], d1[0] @ q2 = t12, q3 = t13
+ butterfly_n d23, d19, q6, q2, q4, q2 @ d23 = t8a, d19 = t12a
+ mbutterfly_l q5, q4, d21, d26, d1[2], d1[3] @ q5 = t11, q4 = t10
+ butterfly_r d4, d27, d16, d27 @ d4 = t4, d27 = t0
+ butterfly_n d24, d28, q7, q3, q6, q3 @ d24 = t9a, d28 = t13a
+
+ mbutterfly_l q6, q7, d30, d17, d1[3], d1[2] @ q6 = t14, q7 = t15
+ butterfly_r d5, d20, d31, d20 @ d5 = t5, d20 = t1
+ butterfly_n d21, d17, q4, q6, q3, q6 @ d21 = t10a, d17 = t14a
+ butterfly_n d26, d30, q5, q7, q4, q7 @ d26 = t11a, d30 = t15a
+
+ butterfly_r d6, d25, d18, d25 @ d6 = t6, d25 = t2
+ butterfly_r d7, d22, d29, d22 @ d7 = t7, d22 = t3
+
+ mbutterfly_l q5, q4, d19, d28, d0[2], d0[3] @ q5 = t13, q4 = t12
+ mbutterfly_l q6, q7, d30, d17, d0[3], d0[2] @ q6 = t14, q7 = t15
+
+ butterfly_n d18, d30, q4, q6, q8, q6 @ d18 = out[2], d30 = t14a
+ butterfly_n d29, d17, q5, q7, q6, q7 @ d29 = -out[13], d17 = t15a
+ vneg.s16 d29, d29 @ d29 = out[13]
+
+ mbutterfly_l q5, q4, d4, d5, d0[2], d0[3] @ q5 = t5a, q4 = t4a
+ mbutterfly_l q6, q7, d7, d6, d0[3], d0[2] @ q6 = t6a, q7 = t7a
+
+ butterfly d2, d6, d27, d25 @ d2 = out[0], d6 = t2a
+ butterfly d3, d7, d23, d21 @ d3 =-out[1], d7 = t10
+
+ butterfly_n d19, d31, q4, q6, q2, q4 @ d19 = -out[3], d31 = t6
+ vneg.s16 d19, d19 @ d19 = out[3]
+ butterfly_n d28, d16, q5, q7, q2, q5 @ d28 = out[12], d16 = t7
+
+ butterfly d5, d8, d20, d22 @ d5 =-out[15],d8 = t3a
+ butterfly d4, d9, d24, d26 @ d4 = out[14],d9 = t11
+
+ mbutterfly0 d23, d24, d6, d8, d10, d11, q6, q7, 1 @ d23 = out[7], d24 = out[8]
+ mbutterfly0 d20, d27, d16, d31, d10, d11, q6, q7 @ d20 = out[4], d27 = out[11]
+ mbutterfly0 d22, d25, d9, d7, d10, d11, q6, q7 @ d22 = out[6], d25 = out[9]
+ mbutterfly0 d21, d26, d30, d17, d10, d11, q6, q7, 1 @ d21 = out[5], d26 = out[10]
+
+ vneg.s16 d31, d5 @ d31 = out[15]
+ vneg.s16 d17, d3 @ d17 = out[1]
+
+ vmov d16, d2
+ vmov d30, d4
+ bx lr
+endfunc
+
+.macro load_add_store coef0, coef1, coef2, coef3
+ vrshr.s16 \coef0, \coef0, #6
+ vrshr.s16 \coef1, \coef1, #6
+
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d4[1]}, [r3,:32], r1
+ vrshr.s16 \coef2, \coef2, #6
+ vrshr.s16 \coef3, \coef3, #6
+ vld1.32 {d5[]}, [r0,:32], r1
+ vld1.32 {d5[1]}, [r3,:32], r1
+ vaddw.u8 \coef0, \coef0, d4
+ vld1.32 {d6[]}, [r0,:32], r1
+ vld1.32 {d6[1]}, [r3,:32], r1
+ vaddw.u8 \coef1, \coef1, d5
+ vld1.32 {d7[]}, [r0,:32], r1
+ vld1.32 {d7[1]}, [r3,:32], r1
+
+ vqmovun.s16 d4, \coef0
+ vqmovun.s16 d5, \coef1
+ sub r0, r0, r1, lsl #2
+ sub r3, r3, r1, lsl #2
+ vaddw.u8 \coef2, \coef2, d6
+ vaddw.u8 \coef3, \coef3, d7
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r3,:32], r1
+ vqmovun.s16 d6, \coef2
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r3,:32], r1
+ vqmovun.s16 d7, \coef3
+
+ vst1.32 {d6[0]}, [r0,:32], r1
+ vst1.32 {d6[1]}, [r3,:32], r1
+ vst1.32 {d7[0]}, [r0,:32], r1
+ vst1.32 {d7[1]}, [r3,:32], r1
+.endm
+
+.macro itxfm16_1d_funcs txfm
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x4 slice and store.
+@ r0 = dst (temp buffer)
+@ r1 = slice offset
+@ r2 = src
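+@ Roughly, in scalar terms (an illustrative sketch only; r2 already points
+@ at the start of the 4-wide column slice, with a row stride of 16 int16):
+@   for (i = 0; i < 16; i++) {
+@       in[i][0..3]      = src[i*16 + 0..3];  // gather the slice
+@       src[i*16 + 0..3] = 0;                 // clear consumed coefficients
+@   }
+@   txfm16(in);                               // 1-D transform on d16-d31
+@   // write the transposed 16x4 result contiguously into the temp buffer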
+function \txfm\()16_1d_4x16_pass1_neon
+ push {lr}
+
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl \txfm\()16
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #12
+ beq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+1:
+ @ Special case: For the last input column (r1 == 12),
+ @ which would be stored as the last row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ last 4x4 block).
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ vmov d28, d16
+ vmov d29, d17
+ vmov d30, d18
+ vmov d31, d19
+ pop {pc}
+endfunc
+
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
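+@ A slice offset of 0 means the last four rows (d28-d31) were carried over
+@ in registers from pass 1 (or zeroed for skipped slices), so they are not
+@ reloaded from the temp buffer.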
+function \txfm\()16_1d_4x16_pass2_neon
+ push {lr}
+ mov r12, #32
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ cmp r3, #0
+ beq 1f
+.irp i, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl \txfm\()16
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+ .short 0, 10, 38, 89
+endconst
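+
+@ A block with eob <= min_eob[i] has no nonzero coefficients in the 4-column
+@ slices from i onwards, so pass 1 below skips the transform for those slices
+@ and only zero-fills their part of the temp buffer.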
+
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #1
+ beq idct16x16_dc_add_neon
+.endif
+ push {r4-r8,lr}
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpush {q4-q7}
+.endif
+
+ @ Align the stack, allocate a temp buffer
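+ @ r7 = (sp & 15) + 512, so that sp ends up 16-byte aligned with 512 bytes
+ @ (a 16x16 int16 buffer) reserved below it; the same amount is added back
+ @ before returning.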
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #512
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.ifc \txfm1,idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ cmp r3, #10
+ ble idct16x16_quarter_add_neon
+ cmp r3, #38
+ ble idct16x16_half_add_neon
+
+ movrel r8, min_eob_idct_idct_16 + 2
+.endif
+
+.irp i, 0, 4, 8, 12
+ add r0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(16 - \i)/4
+ ble 1f
+.endif
+.endif
+ mov r1, #\i
+ add r2, r6, #(\i*2)
+ bl \txfm1\()16_1d_4x16_pass1_neon
+.endr
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ @ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
+ @ passthrough of coefficients to pass 2, and clear the end of the temp buffer
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.endif
+
+.ifc \txfm1\()_\txfm2,iadst_idct
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+.endif
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+ add sp, sp, r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+ vpop {q4-q7}
+.endif
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+itxfm_func16x16 idct, idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct, iadst
+itxfm_func16x16 iadst, iadst
+.ltorg
+
+function idct16_1d_4x16_pass1_quarter_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_quarter
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ @ The first 4x4 block is kept in registers for the second pass,
+ @ store the rest in the temp buffer.
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+ push {lr}
+ @ Only load the top 4 lines, and only do it for the later slices.
+ @ For the first slice, d16-d19 are kept in registers from the first pass.
+ cmp r3, #0
+ beq 1f
+ mov r12, #32
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_quarter
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_half
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #4
+ beq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+1:
+ @ Special case: For the second input column (r1 == 4),
+ @ which would be stored as the second row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ second 4x4 block).
+ add r0, r0, #8
+ vst1.16 {d20}, [r0,:64]!
+ vst1.16 {d24}, [r0,:64]!
+ vst1.16 {d28}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d21}, [r0,:64]!
+ vst1.16 {d25}, [r0,:64]!
+ vst1.16 {d29}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d22}, [r0,:64]!
+ vst1.16 {d26}, [r0,:64]!
+ vst1.16 {d30}, [r0,:64]!
+ add r0, r0, #8
+ vst1.16 {d23}, [r0,:64]!
+ vst1.16 {d27}, [r0,:64]!
+ vst1.16 {d31}, [r0,:64]!
+ vmov d20, d16
+ vmov d21, d17
+ vmov d22, d18
+ vmov d23, d19
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+ push {lr}
+ mov r12, #32
+ cmp r3, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ beq 1f
+.irp i, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_half
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+.purgem load_add_store
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_neon
+ add r0, sp, #(0*32)
+ mov r1, #0
+ add r2, r6, #(0*2)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+ add r0, sp, #(4*32)
+ mov r1, #4
+ add r2, r6, #(4*2)
+ bl idct16_1d_4x16_pass1_\size\()_neon
+.endif
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
+function idct32x32_dc_add_neon
+ movrel r12, idct_coeffs
+ vld1.16 {d0}, [r12,:64]
+
+ vmov.i16 q2, #0
+
+ vld1.16 {d16[]}, [r2,:16]
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vmull.s16 q8, d16, d0[0]
+ vrshrn.s32 d16, q8, #14
+ vdup.16 q8, d16[0]
+ vst1.16 {d4[0]}, [r2,:16]
+
+ vrshr.s16 q8, q8, #6
+
+ mov r3, r0
+ mov r12, #32
+1:
+ @ Loop to add the constant from q8 into all 32x32 outputs
+ subs r12, r12, #2
+ vld1.8 {q0-q1}, [r0,:128], r1
+ vaddw.u8 q9, q8, d0
+ vaddw.u8 q10, q8, d1
+ vld1.8 {q2-q3}, [r0,:128], r1
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q12, q8, d3
+ vaddw.u8 q13, q8, d4
+ vaddw.u8 q14, q8, d5
+ vaddw.u8 q15, q8, d6
+ vqmovun.s16 d0, q9
+ vaddw.u8 q9, q8, d7
+ vqmovun.s16 d1, q10
+ vqmovun.s16 d2, q11
+ vqmovun.s16 d3, q12
+ vqmovun.s16 d4, q13
+ vqmovun.s16 d5, q14
+ vst1.8 {q0-q1}, [r3,:128], r1
+ vqmovun.s16 d6, q15
+ vqmovun.s16 d7, q9
+ vst1.8 {q2-q3}, [r3,:128], r1
+ bne 1b
+
+ bx lr
+endfunc
+
+.macro idct32_end
+ butterfly d16, d9, d8, d9 @ d16 = t16a, d9 = t19a
+ butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18
+ butterfly d18, d10, d11, d10 @ d18 = t23a, d10 = t20a
+ butterfly d19, d21, d22, d21 @ d19 = t22, d21 = t21
+ butterfly d8, d28, d28, d30 @ d8 = t24a, d28 = t27a
+ butterfly d23, d26, d25, d26 @ d23 = t25, d26 = t26
+ butterfly d11, d29, d29, d31 @ d11 = t31a, d29 = t28a
+ butterfly d22, d27, d24, d27 @ d22 = t30, d27 = t29
+
+ mbutterfly d27, d20, d0[2], d0[3], q12, q15 @ d27 = t18a, d20 = t29a
+ mbutterfly d29, d9, d0[2], d0[3], q12, q15 @ d29 = t19, d9 = t28
+ mbutterfly d28, d10, d0[2], d0[3], q12, q15, neg=1 @ d28 = t27, d10 = t20
+ mbutterfly d26, d21, d0[2], d0[3], q12, q15, neg=1 @ d26 = t26a, d21 = t21a
+
+ butterfly d31, d24, d11, d8 @ d31 = t31, d24 = t24
+ butterfly d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+ butterfly_r d23, d16, d16, d18 @ d23 = t23, d16 = t16
+ butterfly_r d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+ butterfly d18, d21, d27, d21 @ d18 = t18, d21 = t21
+ butterfly_r d27, d28, d9, d28 @ d27 = t27a, d28 = t28a
+ butterfly d8, d26, d20, d26 @ d8 = t29, d26 = t26
+ butterfly d19, d20, d29, d10 @ d19 = t19a, d20 = t20
+ vmov d29, d8 @ d29 = t29
+
+ mbutterfly0 d27, d20, d27, d20, d8, d10, q4, q5 @ d27 = t27, d20 = t20
+ mbutterfly0 d26, d21, d26, d21, d8, d10, q4, q5 @ d26 = t26a, d21 = t21a
+ mbutterfly0 d25, d22, d25, d22, d8, d10, q4, q5 @ d25 = t25, d22 = t22
+ mbutterfly0 d24, d23, d24, d23, d8, d10, q4, q5 @ d24 = t24a, d23 = t23a
+ bx lr
+.endm
+
+function idct32_odd
+ mbutterfly d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+ idct32_end
+endfunc
+
+function idct32_odd_half
+ mbutterfly_h1 d16, d31, d4[0], d4[1], q4, q5 @ d16 = t16a, d31 = t31a
+ mbutterfly_h2 d24, d23, d4[2], d4[3], q4, q5 @ d24 = t17a, d23 = t30a
+ mbutterfly_h1 d20, d27, d5[0], d5[1], q4, q5 @ d20 = t18a, d27 = t29a
+ mbutterfly_h2 d28, d19, d5[2], d5[3], q4, q5 @ d28 = t19a, d19 = t28a
+ mbutterfly_h1 d18, d29, d6[0], d6[1], q4, q5 @ d18 = t20a, d29 = t27a
+ mbutterfly_h2 d26, d21, d6[2], d6[3], q4, q5 @ d26 = t21a, d21 = t26a
+ mbutterfly_h1 d22, d25, d7[0], d7[1], q4, q5 @ d22 = t22a, d25 = t25a
+ mbutterfly_h2 d30, d17, d7[2], d7[3], q4, q5 @ d30 = t23a, d17 = t24a
+
+ butterfly d8, d24, d16, d24 @ d8 = t16, d24 = t17
+ butterfly d9, d20, d28, d20 @ d9 = t19, d20 = t18
+ butterfly d10, d26, d18, d26 @ d10 = t20, d26 = t21
+ butterfly d11, d22, d30, d22 @ d11 = t23, d22 = t22
+ butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
+ butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
+ butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
+ butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+ mbutterfly d23, d24, d1[0], d1[1], q8, q9 @ d23 = t17a, d24 = t30a
+ mbutterfly d27, d20, d1[0], d1[1], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+ mbutterfly d21, d26, d1[2], d1[3], q8, q9 @ d21 = t21a, d26 = t26a
+ mbutterfly d25, d22, d1[2], d1[3], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+
+ idct32_end
+endfunc
+
+function idct32_odd_quarter
+ vmull.s16 q4, d16, d4[0]
+ vmull.s16 q14, d19, d5[3]
+ vmull.s16 q15, d16, d4[1]
+ vmull.s16 q11, d17, d7[2]
+ vmull.s16 q5, d17, d7[3]
+ vmull.s16 q13, d19, d5[2]
+ vmull.s16 q10, d18, d6[0]
+ vmull.s16 q12, d18, d6[1]
+
+ vneg.s32 q14, q14
+ vneg.s32 q5, q5
+
+ vrshrn.s32 d8, q4, #14
+ vrshrn.s32 d9, q14, #14
+ vrshrn.s32 d29, q15, #14
+ vrshrn.s32 d28, q11, #14
+ vrshrn.s32 d11, q5, #14
+ vrshrn.s32 d31, q13, #14
+ vrshrn.s32 d10, q10, #14
+ vrshrn.s32 d30, q12, #14
+
+ mbutterfly_l q8, q9, d29, d8, d1[0], d1[1]
+ mbutterfly_l q13, q10, d31, d9, d1[0], d1[1]
+ vrshrn.s32 d23, q8, #14
+ vrshrn.s32 d24, q9, #14
+ vneg.s32 q10, q10
+ vrshrn.s32 d27, q13, #14
+ vrshrn.s32 d20, q10, #14
+ mbutterfly_l q8, q9, d30, d10, d1[2], d1[3]
+ vrshrn.s32 d21, q8, #14
+ vrshrn.s32 d26, q9, #14
+ mbutterfly_l q8, q9, d28, d11, d1[2], d1[3]
+ vrshrn.s32 d25, q8, #14
+ vneg.s32 q9, q9
+ vrshrn.s32 d22, q9, #14
+
+ idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+@ We don't have the register space for a single-pass IDCT of 4x32,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs:
+@ a normal IDCT16 with every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
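+@ Schematically, with e[] = the idct16 of the even inputs and o[] = the
+@ odd-input transform (an illustrative view, not the exact register flow):
+@   out[i]      = e[i] + o[i]
+@   out[31 - i] = e[i] - o[i],    i = 0..15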
+function idct32_1d_4x32_pass1\suffix\()_neon
+ push {lr}
+
+ @ idct16 uses q2-q3 as scratch (it leaves q4-q7 alone so that the plain
+ @ 16x16 idct doesn't have to save them), so move the idct32_odd coeffs
+ @ out of the way to q4-q5
+ vmov q4, q2
+ vmov q5, q3
+
+ @ Double stride of the input, since we only read every other line
+ mov r12, #128
+ vmov.s16 d4, #0
+
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct16\suffix
+
+ @ Move the idct32_odd coeffs back into q2-q3 for idct32_odd;
+ @ the constants for a vmul with a lane must be in q0-q3.
+ vmov q2, q4
+ vmov q3, q5
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ @ Store the registers a, b, c, d horizontally, followed
+ @ by the same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+ vst1.16 {d\i}, [r0,:64]!
+ vrev64.16 d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+.endm
+ store_rev 16, 20, 24, 28
+ store_rev 17, 21, 25, 29
+ store_rev 18, 22, 26, 30
+ store_rev 19, 23, 27, 31
+ sub r0, r0, #256
+.purgem store_rev
+
+ @ Move r2 back to the start of the input, and move
+ @ to the first odd row
+.ifb \suffix
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+ sub r2, r2, r12, lsl #3
+.endif
+ add r2, r2, #64
+
+ vmov.s16 d8, #0
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d8}, [r2,:64], r12
+.endr
+.endif
+
+ bl idct32_odd\suffix
+
+ transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+ @ Store the registers a, b, c, d horizontally,
+ @ adding into the output first, and then mirrored, subtracted
+ @ from the output.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+ vld1.16 {d8}, [r0,:64]
+ vadd.s16 d8, d8, d\i
+ vst1.16 {d8}, [r0,:64]!
+ vrev64.16 d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+ vld1.16 {d8}, [r0,:64]
+ vsub.s16 d8, d8, d\i
+ vst1.16 {d8}, [r0,:64]!
+.endr
+.endm
+
+ store_rev 31, 27, 23, 19
+ store_rev 30, 26, 22, 18
+ store_rev 29, 25, 21, 17
+ store_rev 28, 24, 20, 16
+.purgem store_rev
+ pop {pc}
+endfunc
+.ltorg
+
+@ This is mostly the same as 4x32_pass1, but without the transpose;
+@ it uses the source as a temp buffer between the two idct passes,
+@ and adds the result into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
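+@ The even idct16 result is parked back into the temp buffer, the odd
+@ transform is run, and load_acc_store then combines the two halves and
+@ adds the rounded result, (x + 32) >> 6, into the destination pixels.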
+function idct32_1d_4x32_pass2\suffix\()_neon
+ push {lr}
+ vmov q4, q2
+ vmov q5, q3
+
+ mov r12, #128
+ @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+
+ bl idct16\suffix
+
+ vmov q2, q4
+ vmov q3, q5
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vst1.16 {d\i}, [r2,:64], r12
+.endr
+
+ sub r2, r2, r12, lsl #4
+ add r2, r2, #64
+
+ @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ sub r2, r2, r12, lsl #3
+.endif
+ sub r2, r2, #64
+
+ bl idct32_odd\suffix
+
+ mov r12, #128
+.macro load_acc_store a, b, c, d, neg=0
+ vld1.16 {d8}, [r2,:64], r12
+ vld1.16 {d9}, [r2,:64], r12
+.if \neg == 0
+ vadd.s16 d8, d8, d\a
+ vld1.16 {d10}, [r2,:64], r12
+ vadd.s16 d9, d9, d\b
+ vld1.16 {d11}, [r2,:64], r12
+ vadd.s16 d10, d10, d\c
+ vadd.s16 d11, d11, d\d
+.else
+ vsub.s16 d8, d8, d\a
+ vld1.16 {d10}, [r2,:64], r12
+ vsub.s16 d9, d9, d\b
+ vld1.16 {d11}, [r2,:64], r12
+ vsub.s16 d10, d10, d\c
+ vsub.s16 d11, d11, d\d
+.endif
+ vld1.32 {d12[]}, [r0,:32], r1
+ vld1.32 {d12[1]}, [r0,:32], r1
+ vrshr.s16 q4, q4, #6
+ vld1.32 {d13[]}, [r0,:32], r1
+ vrshr.s16 q5, q5, #6
+ vld1.32 {d13[1]}, [r0,:32], r1
+ sub r0, r0, r1, lsl #2
+ vaddw.u8 q4, q4, d12
+ vaddw.u8 q5, q5, d13
+ vqmovun.s16 d8, q4
+ vqmovun.s16 d9, q5
+ vst1.32 {d8[0]}, [r0,:32], r1
+ vst1.32 {d8[1]}, [r0,:32], r1
+ vst1.32 {d9[0]}, [r0,:32], r1
+ vst1.32 {d9[1]}, [r0,:32], r1
+.endm
+ load_acc_store 31, 30, 29, 28
+ load_acc_store 27, 26, 25, 24
+ load_acc_store 23, 22, 21, 20
+ load_acc_store 19, 18, 17, 16
+ sub r2, r2, r12
+ neg r12, r12
+ load_acc_store 16, 17, 18, 19, 1
+ load_acc_store 20, 21, 22, 23, 1
+ load_acc_store 24, 25, 26, 27, 1
+ load_acc_store 28, 29, 30, 31, 1
+.purgem load_acc_store
+ pop {pc}
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+ .short 0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+ cmp r3, #1
+ beq idct32x32_dc_add_neon
+ push {r4-r8,lr}
+ vpush {q4-q6}
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #2048
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]!
+ vld1.16 {q2-q3}, [r12,:128]
+
+ cmp r3, #34
+ ble idct32x32_quarter_add_neon
+ cmp r3, #135
+ ble idct32x32_half_add_neon
+
+ movrel r8, min_eob_idct_idct_32 + 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, sp, #(\i*64)
+.if \i > 0
+ ldrh_post r1, r8, #2
+ cmp r3, r1
+ it le
+ movle r1, #(32 - \i)/2
+ ble 1f
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_neon
+.endr
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+2:
+ subs r1, r1, #1
+.rept 4
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+ bne 2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q6}
+ pop {r4-r8,pc}
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
+.irp i, 0, 4
+ add r0, sp, #(\i*64)
+.ifc \size,quarter
+.if \i == 4
+ cmp r3, #9
+ ble 1f
+.endif
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+ add r0, sp, #(\i*64)
+.if \i == 12
+ cmp r3, #70
+ ble 1f
+.endif
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+ b 3f
+
+1:
+ @ Write zeros to the temp buffer for pass 2
+ vmov.i16 q14, #0
+ vmov.i16 q15, #0
+.rept 8
+ vst1.16 {q14-q15}, [r0,:128]!
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q6}
+ pop {r4-r8,pc}
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
diff --git a/libavcodec/arm/vp9lpf_16bpp_neon.S b/libavcodec/arm/vp9lpf_16bpp_neon.S
new file mode 100644
index 0000000000..7d2571dcc0
--- /dev/null
+++ b/libavcodec/arm/vp9lpf_16bpp_neon.S
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+ vswp \r1, \r8 @ vtrn.64 \rq0, \rq4
+ vswp \r3, \r10 @ vtrn.64 \rq1, \rq5
+ vswp \r5, \r12 @ vtrn.64 \rq2, \rq6
+ vswp \r7, \r14 @ vtrn.64 \rq3, \rq7
+ vtrn.32 \rq0, \rq2
+ vtrn.32 \rq1, \rq3
+ vtrn.32 \rq4, \rq6
+ vtrn.32 \rq5, \rq7
+ vtrn.16 \rq0, \rq1
+ vtrn.16 \rq2, \rq3
+ vtrn.16 \rq4, \rq5
+ vtrn.16 \rq6, \rq7
+.endm
+
+.macro transpose16_4x4 r0, r1, r2, r3
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+@ Do a 4x4 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1
+.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
+ vtrn.32 \rq0, \rq1
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+.endm
+
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
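+@ The filter-enable mask computed below mirrors the scalar condition
+@ (a rough reference, not the exact instruction order):
+@   fm = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
+@            abs(q1-q0), abs(q2-q1), abs(q3-q2)) <= I
+@        && abs(p0-q0) * 2 + (abs(p1-q1) >> 1) <= E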
+.macro loop_filter_q wd
+ vdup.u16 q0, r2 @ E
+ vdup.u16 q1, r3 @ I
+
+ vabd.u16 q2, q8, q9 @ abs(p3 - p2)
+ vabd.u16 q3, q9, q10 @ abs(p2 - p1)
+ vabd.u16 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u16 q5, q12, q13 @ abs(q0 - q1)
+ vabd.u16 q6, q13, q14 @ abs(q1 - q2)
+ vabd.u16 q7, q14, q15 @ abs(q2 - q3)
+ vmax.u16 q2, q2, q3
+ vmax.u16 q3, q4, q5
+ vmax.u16 q4, q6, q7
+ vabd.u16 q5, q11, q12 @ abs(p0 - q0)
+ vmax.u16 q2, q2, q3
+ vadd.u16 q5, q5, q5 @ abs(p0 - q0) * 2
+ vabd.u16 q6, q10, q13 @ abs(p1 - q1)
+ vmax.u16 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u16 q6, q6, #1
+ vcle.u16 q2, q2, q1 @ max(abs()) <= I
+ vadd.u16 q5, q5, q6 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vcle.u16 q5, q5, q0
+ vand q2, q2, q5 @ fm
+
+ vmovn.u16 d10, q2
+ vmov r8, r9, d10
+ orrs r8, r8, r9
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vdup.u16 q0, r5
+
+ vabd.u16 q1, q8, q11 @ abs(p3 - p0)
+ vabd.u16 q3, q9, q11 @ abs(p2 - p0)
+ vabd.u16 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u16 q5, q13, q12 @ abs(q1 - q0)
+ vabd.u16 q6, q14, q12 @ abs(q2 - q0)
+ vabd.u16 q7, q15, q12 @ abs(q3 - q0)
+ vmax.u16 q1, q1, q3
+ vmax.u16 q4, q4, q5
+ vmax.u16 q6, q6, q7
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u16 q3, q10, q11 @ abs(p1 - p0)
+.if \wd == 8
+ vmax.u16 q1, q1, q4
+.endif
+ vabd.u16 q4, q13, q12 @ abs(q1 - q0)
+.if \wd == 8
+ vmax.u16 q1, q1, q6
+.endif
+
+ vsub.u16 q5, q10, q13 @ p1 - q1
+ vmax.u16 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
+ vdup.u16 q4, r4 @ H
+ vsub.u16 q6, q12, q11 @ q0 - p0
+.if \wd == 8
+ vcle.u16 q1, q1, q0 @ flat8in
+.endif
+ vdup.u16 q0, r6 @ left shift for saturation
+ vcle.u16 q3, q3, q4 @ !hev
+.if \wd == 8
+ vand q1, q1, q2 @ flat8in && fm
+.endif
+ vneg.s16 q4, q0 @ negative left shift after saturation
+ vqshl.s16 q5, q5, q0
+.if \wd == 8
+ vbic q2, q2, q1 @ fm && !flat8in
+.endif
+ vmov.s16 q7, #3
+ vand q3, q3, q2 @ !hev && fm && !flat8in
+ vshl.s16 q5, q5, q4 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ vmul.s16 q6, q6, q7 @ 3 * (q0 - p0)
+ vbic q5, q5, q3 @ if (!hev) av_clip_int2p = 0
+ vadd.s16 q6, q6, q5 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+ vmov.s16 q5, #4
+ vqshl.s16 q6, q6, q0
+ vmov.s16 q0, #3
+ vshl.s16 q6, q6, q4 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ vdup.u16 q4, r7 @ max pixel value
+
+ vshr.u16 q4, q4, #1 @ (1 << (BIT_DEPTH - 1)) - 1
+
+ vadd.s16 q5, q6, q5 @ f + 4
+ vadd.s16 q0, q6, q0 @ f + 3
+ vmov.s16 q6, #0
+ vmin.s16 q5, q5, q4 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ vmin.s16 q0, q0, q4 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ vdup.u16 q4, r7 @ max pixel value
+ vshr.s16 q5, q5, #3 @ f1
+ vshr.s16 q0, q0, #3 @ f2
+
+ vadd.s16 q0, q11, q0 @ p0 + f2
+ vsub.s16 q7, q12, q5 @ q0 - f1
+ vmin.s16 q0, q0, q4
+ vmin.s16 q7, q7, q4
+ vrshr.s16 q5, q5, #1 @ f = (f1 + 1) >> 1
+ vmax.s16 q0, q0, q6 @ out p0
+ vmax.s16 q7, q7, q6 @ out q0
+ vbit q11, q0, q2 @ if (fm && !flat8in)
+ vbit q12, q7, q2
+.if \wd >= 8
+ vmovn.u16 d4, q1
+.endif
+
+ vadd.s16 q0, q10, q5 @ p1 + f
+ vsub.s16 q7, q13, q5 @ q1 - f
+.if \wd >= 8
+ vmov r8, r9, d4
+.endif
+ vmin.s16 q0, q0, q4
+ vmin.s16 q7, q7, q4
+.if \wd >= 8
+ orrs r8, r8, r9
+.endif
+ vmax.s16 q0, q0, q6 @ out p1
+ vmax.s16 q7, q7, q6 @ out q1
+ vbit q10, q0, q3 @ if (!hev && fm && !flat8in)
+ vbit q13, q7, q3
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
+ beq 6f
+
+ @ flat8in
+ vadd.u16 q2, q8, q9
+ vadd.u16 q3, q10, q13
+ vadd.u16 q4, q8, q10
+ vadd.u16 q5, q11, q14
+ vadd.u16 q0, q2, q2
+ vadd.u16 q0, q0, q11
+ vadd.u16 q0, q0, q12
+ vadd.u16 q0, q0, q4
+ vsub.s16 q3, q3, q2
+ vsub.s16 q5, q5, q4
+ vrshr.u16 q6, q0, #3 @ out p2
+
+ vadd.u16 q0, q0, q3
+ vadd.u16 q2, q8, q11
+ vadd.u16 q3, q12, q15
+ vrshr.u16 q7, q0, #3 @ out p1
+
+ vadd.u16 q0, q0, q5
+ vsub.s16 q3, q3, q2
+ vadd.u16 q4, q9, q12
+ vbit q9, q6, q1
+ vadd.u16 q5, q13, q15
+ vrshr.u16 q6, q0, #3 @ out p0
+
+ vadd.u16 q0, q0, q3
+ vsub.s16 q5, q5, q4
+ vadd.u16 q2, q10, q13
+ vbit q10, q7, q1
+ vadd.u16 q3, q14, q15
+ vrshr.u16 q7, q0, #3 @ out q0
+
+ vadd.u16 q0, q0, q5
+ vsub.s16 q3, q3, q2
+ vbit q11, q6, q1
+ vrshr.u16 q6, q0, #3 @ out q1
+
+ vadd.u16 q0, q0, q3
+ vbit q12, q7, q1
+ vrshr.u16 q7, q0, #3 @ out q2
+ vbit q13, q6, q1
+ vbit q14, q7, q1
+.endif
+.endm
+
+@ The input to and output from this macro is in the registers d16-d31,
+@ and d0-d7 are used as scratch registers.
+@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
+@ Depending on the width of the loop filter, we either use d16-d19
+@ and d28-d31 as temp registers, or d8-d15.
+@ In practice, this is only ever instantiated once, so the macro parameters
+@ could be hardcoded, but they are kept as parameters to stay close to the
+@ 8 bpp and aarch64 versions.
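+@ Three nested masks select the filter strength per pixel:
+@   fm                        -> normal filter; updates p0/q0 (and p1/q1 if !hev)
+@   fm && flat8in             -> 7-tap filter;  updates p2 .. q2
+@   fm && flat8in && flat8out -> 15-tap filter (wd == 16); updates p6 .. q6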
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+ vdup.u16 d0, r2 @ E
+ vdup.u16 d2, r3 @ I
+
+ vabd.u16 d4, d20, d21 @ abs(p3 - p2)
+ vabd.u16 d5, d21, d22 @ abs(p2 - p1)
+ vabd.u16 d6, d22, d23 @ abs(p1 - p0)
+ vabd.u16 d7, d24, d25 @ abs(q0 - q1)
+ vabd.u16 \tmp1, d25, d26 @ abs(q1 - q2)
+ vabd.u16 \tmp2, d26, d27 @ abs(q2 - q3)
+ vmax.u16 d4, d4, d5
+ vmax.u16 d5, d6, d7
+ vmax.u16 \tmp1, \tmp1, \tmp2
+ vabd.u16 d6, d23, d24 @ abs(p0 - q0)
+ vmax.u16 d4, d4, d5
+ vadd.u16 d6, d6, d6 @ abs(p0 - q0) * 2
+ vabd.u16 d5, d22, d25 @ abs(p1 - q1)
+ vmax.u16 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u16 d5, d5, #1
+ vcle.u16 d4, d4, d2 @ max(abs()) <= I
+ vadd.u16 d6, d6, d5 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+ vcle.u16 d6, d6, d0
+ vand d4, d4, d6 @ fm
+
+ vdup.u16 d3, r4 @ H
+ vmov r8, r9, d4
+ orrs r8, r8, r9
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vdup.u16 d0, r5
+
+ vabd.u16 d6, d20, d23 @ abs(p3 - p0)
+ vabd.u16 d2, d21, d23 @ abs(p2 - p0)
+ vabd.u16 d1, d22, d23 @ abs(p1 - p0)
+ vabd.u16 \tmp1, d25, d24 @ abs(q1 - q0)
+ vabd.u16 \tmp2, d26, d24 @ abs(q2 - q0)
+ vabd.u16 \tmp3, d27, d24 @ abs(q3 - q0)
+ vmax.u16 d6, d6, d2
+ vmax.u16 d1, d1, \tmp1
+ vmax.u16 \tmp2, \tmp2, \tmp3
+.if \wd == 16
+ vabd.u16 d7, d16, d23 @ abs(p7 - p0)
+ vmax.u16 d6, d6, d1
+ vabd.u16 d2, d17, d23 @ abs(p6 - p0)
+ vmax.u16 d6, d6, \tmp2
+ vabd.u16 d1, d18, d23 @ abs(p5 - p0)
+ vcle.u16 d6, d6, d0 @ flat8in
+ vabd.u16 d8, d19, d23 @ abs(p4 - p0)
+ vand d6, d6, d4 @ flat8in && fm
+ vabd.u16 d9, d28, d24 @ abs(q4 - q0)
+ vbic d4, d4, d6 @ fm && !flat8in
+ vabd.u16 d10, d29, d24 @ abs(q5 - q0)
+ vabd.u16 d11, d30, d24 @ abs(q6 - q0)
+ vabd.u16 d12, d31, d24 @ abs(q7 - q0)
+
+ vmax.u16 d7, d7, d2
+ vmax.u16 d1, d1, d8
+ vmax.u16 d9, d9, d10
+ vmax.u16 d11, d11, d12
+ @ The rest of the calculation of flat8out is interleaved below
+.else
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u16 d5, d22, d23 @ abs(p1 - p0)
+.if \wd == 16
+ vmax.u16 d7, d7, d1
+ vmax.u16 d9, d9, d11
+.elseif \wd == 8
+ vmax.u16 d6, d6, d1
+.endif
+ vabd.u16 d1, d25, d24 @ abs(q1 - q0)
+.if \wd == 16
+ vmax.u16 d7, d7, d9
+.elseif \wd == 8
+ vmax.u16 d6, d6, \tmp2
+.endif
+ vdup.u16 \tmp2, r6 @ left shift for saturation
+ vsub.u16 \tmp1, d22, d25 @ p1 - q1
+ vneg.s16 \tmp6, \tmp2 @ negative left shift after saturation
+ vmax.u16 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
+ vsub.u16 \tmp3, d24, d23 @ q0 - p0
+ vmov.s16 \tmp5, #3
+.if \wd == 8
+ vcle.u16 d6, d6, d0 @ flat8in
+.endif
+ vcle.u16 d5, d5, d3 @ !hev
+.if \wd == 8
+ vand d6, d6, d4 @ flat8in && fm
+.endif
+ vqshl.s16 \tmp1, \tmp1, \tmp2
+.if \wd == 16
+ vcle.u16 d7, d7, d0 @ flat8out
+.elseif \wd == 8
+ vbic d4, d4, d6 @ fm && !flat8in
+.endif
+ vand d5, d5, d4 @ !hev && fm && !flat8in
+.if \wd == 16
+ vand d7, d7, d6 @ flat8out && flat8in && fm
+.endif
+ vshl.s16 \tmp1, \tmp1, \tmp6 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
+
+ vmul.s16 \tmp3, \tmp3, \tmp5 @ 3 * (q0 - p0)
+ vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int2p = 0
+ vmov.s16 d2, #4
+ vadd.s16 \tmp3, \tmp3, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
+ vmov.s16 d3, #3
+ vqshl.s16 \tmp1, \tmp3, \tmp2
+ vmov.s16 \tmp5, #0
+ vshl.s16 \tmp1, \tmp1, \tmp6 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
+ vdup.u16 \tmp6, r7 @ max pixel value
+.if \wd == 16
+ vbic d6, d6, d7 @ fm && flat8in && !flat8out
+.endif
+
+ vshr.u16 \tmp2, \tmp6, #1 @ (1 << (BIT_DEPTH - 1)) - 1
+
+ vadd.s16 \tmp3, \tmp1, d2 @ f + 4
+ vadd.s16 \tmp4, \tmp1, d3 @ f + 3
+ vmin.s16 \tmp3, \tmp3, \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+ vmin.s16 \tmp4, \tmp4, \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+ vshr.s16 \tmp3, \tmp3, #3 @ f1
+ vshr.s16 \tmp4, \tmp4, #3 @ f2
+
+ vadd.s16 d0, d23, \tmp4 @ p0 + f2
+ vsub.s16 d2, d24, \tmp3 @ q0 - f1
+ vmin.s16 d0, d0, \tmp6
+ vmin.s16 d2, d2, \tmp6
+ vrshr.s16 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
+ vmax.s16 d0, d0, \tmp5 @ out p0
+ vmax.s16 d2, d2, \tmp5 @ out q0
+ vbit d23, d0, d4 @ if (fm && !flat8in)
+ vbit d24, d2, d4
+
+ vadd.s16 d0, d22, \tmp3 @ p1 + f
+ vsub.s16 d2, d25, \tmp3 @ q1 - f
+.if \wd >= 8
+ vmov r8, r9, d6
+.endif
+ vmin.s16 d0, d0, \tmp6
+ vmin.s16 d2, d2, \tmp6
+.if \wd >= 8
+ orrs r8, r8, r9
+.endif
+ vmax.s16 d0, d0, \tmp5 @ out p1
+ vmax.s16 d2, d2, \tmp5 @ out q1
+ vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
+ vbit d25, d2, d5
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to flat8out
+ @ (or to a writeout of the inner 4 pixels, for wd=8)
+ beq 6f
+
+ @ flat8in
+ vadd.u16 \tmp1, d20, d21
+ vadd.u16 \tmp3, d22, d25
+ vadd.u16 \tmp5, d20, d22
+ vadd.u16 \tmp7, d23, d26
+ vadd.u16 d0, \tmp1, \tmp1
+ vadd.u16 d0, d0, d23
+ vadd.u16 d0, d0, d24
+ vadd.u16 d0, d0, \tmp5
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vsub.s16 \tmp7, \tmp7, \tmp5
+ vrshr.u16 d2, d0, #3 @ out p2
+
+ vadd.u16 d0, d0, \tmp3
+ vadd.u16 \tmp1, d20, d23
+ vadd.u16 \tmp3, d24, d27
+ vrshr.u16 d3, d0, #3 @ out p1
+
+ vadd.u16 d0, d0, \tmp7
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vadd.u16 \tmp5, d21, d24
+ vadd.u16 \tmp7, d25, d27
+ vrshr.u16 d4, d0, #3 @ out p0
+
+ vadd.u16 d0, d0, \tmp3
+ vsub.s16 \tmp7, \tmp7, \tmp5
+ vadd.u16 \tmp1, d22, d25
+ vadd.u16 \tmp3, d26, d27
+ vrshr.u16 d5, d0, #3 @ out q0
+
+ vadd.u16 d0, d0, \tmp7
+ vsub.s16 \tmp3, \tmp3, \tmp1
+ vrshr.u16 \tmp5, d0, #3 @ out q1
+
+ vadd.u16 d0, d0, \tmp3
+ @ The output here is written back into the input registers. This doesn't
+ @ matter for the flat8out part below, since we only update those pixels
+ @ which won't be touched below.
+ vbit d21, d2, d6
+ vbit d22, d3, d6
+ vbit d23, d4, d6
+ vrshr.u16 \tmp6, d0, #3 @ out q2
+ vbit d24, d5, d6
+ vbit d25, \tmp5, d6
+ vbit d26, \tmp6, d6
+.endif
+.if \wd == 16
+6:
+ vorr d2, d6, d7
+ vmov r8, r9, d2
+ orrs r8, r8, r9
+ @ If no pixels needed flat8in nor flat8out, jump to a
+ @ writeout of the inner 4 pixels
+ beq 7f
+ vmov r8, r9, d7
+ orrs r8, r8, r9
+ @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ beq 8f
+
+ @ flat8out
+ @ This writes all outputs into d2-d17 (skipping d7 and d16).
+ @ If this part is skipped, the output is read from d21-d26 (which is the input
+ @ to this section).
+ vshl.u16 d0, d16, #3 @ 8 * d16
+ vsub.u16 d0, d0, d16 @ 7 * d16
+ vadd.u16 d0, d0, d17
+ vadd.u16 d8, d17, d18
+ vadd.u16 d10, d19, d20
+ vadd.s16 d0, d0, d8
+ vadd.u16 d8, d16, d17
+ vadd.u16 d12, d21, d22
+ vadd.s16 d0, d0, d10
+ vadd.u16 d10, d18, d25
+ vadd.u16 d14, d23, d24
+ vsub.s16 d10, d10, d8
+ vadd.s16 d0, d0, d12
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d18
+ vadd.u16 d14, d19, d26
+ vrshr.u16 d2, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d8, d16, d19
+ vadd.u16 d10, d20, d27
+ vsub.s16 d14, d14, d12
+ vbif d2, d17, d7
+ vrshr.u16 d3, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d20
+ vadd.u16 d14, d21, d28
+ vsub.s16 d10, d10, d8
+ vbif d3, d18, d7
+ vrshr.u16 d4, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d8, d16, d21
+ vadd.u16 d10, d22, d29
+ vsub.s16 d14, d14, d12
+ vbif d4, d19, d7
+ vrshr.u16 d5, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d12, d16, d22
+ vadd.u16 d14, d23, d30
+ vsub.s16 d10, d10, d8
+ vbif d5, d20, d7
+ vrshr.u16 d6, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vadd.u16 d10, d16, d23
+ vsub.s16 d14, d14, d12
+ vadd.u16 d12, d24, d31
+ vbif d6, d21, d7
+ vrshr.u16 d8, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vsub.s16 d10, d12, d10
+ vadd.u16 d12, d17, d24
+ vadd.u16 d14, d25, d31
+ vbif d8, d22, d7
+ vrshr.u16 d9, d0, #4
+
+ vadd.s16 d0, d0, d10
+ vsub.s16 d14, d14, d12
+ vadd.u16 d12, d26, d31
+ vbif d9, d23, d7
+ vrshr.u16 d10, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d14, d18, d25
+ vadd.u16 d18, d19, d26
+ vsub.s16 d12, d12, d14
+ vadd.u16 d14, d27, d31
+ vbif d10, d24, d7
+ vrshr.u16 d11, d0, #4
+
+ vadd.s16 d0, d0, d12
+ vadd.u16 d12, d20, d27
+ vsub.s16 d14, d14, d18
+ vadd.u16 d18, d28, d31
+ vbif d11, d25, d7
+ vsub.s16 d18, d18, d12
+ vrshr.u16 d12, d0, #4
+
+ vadd.s16 d0, d0, d14
+ vadd.u16 d14, d21, d28
+ vadd.u16 d20, d29, d31
+ vbif d12, d26, d7
+ vrshr.u16 d13, d0, #4
+
+ vadd.s16 d0, d0, d18
+ vsub.s16 d20, d20, d14
+ vadd.u16 d18, d22, d29
+ vadd.u16 d22, d30, d31
+ vbif d13, d27, d7
+ vrshr.u16 d14, d0, #4
+
+ vadd.s16 d0, d0, d20
+ vsub.s16 d22, d22, d18
+ vbif d14, d28, d7
+ vrshr.u16 d15, d0, #4
+
+ vadd.s16 d0, d0, d22
+ vbif d15, d29, d7
+ vrshr.u16 d17, d0, #4
+ vbif d17, d30, d7
+.endif
+.endm
+
+.macro loop_filter_q_4
+ loop_filter_q 4
+.endm
+
+.macro loop_filter_q_8
+ loop_filter_q 8
+.endm
+
+.macro loop_filter_16
+ loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15
+.endm
+
+
+@ The public functions in this file have the following signature:
+@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
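+@ mb_lim (E), lim (I) and hev_thr (H) arrive in 8 bpp units; the 10/12 bpp
+@ frontends below shift them up by (bpp - 8), and additionally pass the flat
+@ threshold (1 << (bpp - 8)), the saturation shift (16 - bpp) and the max
+@ pixel value ((1 << bpp) - 1) in r5, r6 and r7.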
+
+.macro bpp_frontend func, bpp
+function ff_\func\()_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl \func\()_16_neon
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends func
+ bpp_frontend \func, 10
+ bpp_frontend \func, 12
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+ add r0, r0, r1, lsl #2
+.else
+ add r0, r0, #8
+.endif
+ bl \func\()_\int_suffix\()_16_neon
+.if \rep >= 4
+.ifc \dir,h
+ add r0, r0, r1, lsl #2
+ bl \func\()_\int_suffix\()_16_neon
+ add r0, r0, r1, lsl #2
+ bl \func\()_\int_suffix\()_16_neon
+.else
+ add r0, r0, #8
+ bl \func\()_\int_suffix\()_16_neon
+ add r0, r0, #8
+ bl \func\()_\int_suffix\()_16_neon
+.endif
+.endif
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
+ bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
+ bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
+.endm
+
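+@ The mix2 wrappers filter two adjacent 8-pixel edges with (possibly)
+@ different filter widths; mb_lim, lim and hev_thr carry the parameters for
+@ the two halves packed into bits 0-7 and 8-15, which is why they are
+@ unpacked with and/lsr around the two calls below.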
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+ push {r4-r9,lr}
+ ldr r4, [sp, #28]
+ vpush {q4-q7}
+ push {r2, r3, r4}
+ and r2, r2, #0xff
+ and r3, r3, #0xff
+ and r4, r4, #0xff
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ mov r5, #1 << (\bpp - 8)
+ mov r6, #16 - \bpp
+ movw r7, #((1 << \bpp) - 1)
+ bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+ add r0, r0, r1, lsl #3
+.else
+ add r0, r0, #16
+.endif
+ pop {r2, r3, r4}
+ lsr r2, r2, #8
+ lsr r3, r3, #8
+ lsr r4, r4, #8
+ lsl r2, r2, #\bpp - 8
+ lsl r3, r3, #\bpp - 8
+ lsl r4, r4, #\bpp - 8
+ bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+ vpop {q4-q7}
+ pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro bpp_frontends_mix2 wd1, wd2
+ bpp_frontend_mix2 \wd1, \wd2, v, 10
+ bpp_frontend_mix2 \wd1, \wd2, v, 12
+ bpp_frontend_mix2 \wd1, \wd2, h, 10
+ bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+ sub r12, r0, r1, lsl #2
+ vld1.16 {q8}, [r12,:128], r1 @ p3
+ vld1.16 {q12}, [r0, :128], r1 @ q0
+ vld1.16 {q9}, [r12,:128], r1 @ p2
+ vld1.16 {q13}, [r0, :128], r1 @ q1
+ vld1.16 {q10}, [r12,:128], r1 @ p1
+ vld1.16 {q14}, [r0, :128], r1 @ q2
+ vld1.16 {q11}, [r12,:128], r1 @ p0
+ vld1.16 {q15}, [r0, :128], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_q_4
+
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+9:
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
+
+function vp9_loop_filter_h_4_8_16_neon
+ sub r12, r0, #8
+ add r0, r12, r1, lsl #2
+ vld1.16 {q8}, [r12,:64], r1
+ vld1.16 {q12}, [r0, :64], r1
+ vld1.16 {q9}, [r12,:64], r1
+ vld1.16 {q13}, [r0, :64], r1
+ vld1.16 {q10}, [r12,:64], r1
+ vld1.16 {q14}, [r0, :64], r1
+ vld1.16 {q11}, [r12,:64], r1
+ vld1.16 {q15}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #4
+ add r0, r0, #4
+
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ loop_filter_q_4
+
+ @ We only will write the mid 4 pixels back; after the loop filter,
+ @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
+ @ We need to transpose them to columns, done with a
+ @ 4x4 transpose (which in practice is two 4x4 transposes of the two
+ @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
+ transpose16_4x4 q10, q11, q12, q13
+
+ vst1.16 {d20}, [r12], r1
+ vst1.16 {d21}, [r0], r1
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d23}, [r0], r1
+ vst1.16 {d24}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ vst1.16 {d26}, [r12], r1
+ vst1.16 {d27}, [r0], r1
+ sub r12, r12, r1, lsl #2
+9:
+ add r0, r12, #4
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+
+function vp9_loop_filter_v_8_8_16_neon
+ sub r12, r0, r1, lsl #2
+ vld1.16 {q8}, [r12,:128], r1 @ p3
+ vld1.16 {q12}, [r0, :128], r1 @ q0
+ vld1.16 {q9}, [r12,:128], r1 @ p2
+ vld1.16 {q13}, [r0, :128], r1 @ q1
+ vld1.16 {q10}, [r12,:128], r1 @ p1
+ vld1.16 {q14}, [r0, :128], r1 @ q2
+ vld1.16 {q11}, [r12,:128], r1 @ p0
+ vld1.16 {q15}, [r0, :128], r1 @ q3
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, r1
+
+ loop_filter_q_8
+
+ vst1.16 {q9}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q14}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+9:
+ bx lr
+6:
+ sub r12, r0, r1, lsl #1
+ vst1.16 {q10}, [r12,:128], r1
+ vst1.16 {q12}, [r0, :128], r1
+ vst1.16 {q11}, [r12,:128], r1
+ vst1.16 {q13}, [r0, :128], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+
+function vp9_loop_filter_h_8_8_16_neon
+ sub r12, r0, #8
+ add r0, r12, r1, lsl #2
+ vld1.16 {q8}, [r12,:64], r1
+ vld1.16 {q12}, [r0, :64], r1
+ vld1.16 {q9}, [r12,:64], r1
+ vld1.16 {q13}, [r0, :64], r1
+ vld1.16 {q10}, [r12,:64], r1
+ vld1.16 {q14}, [r0, :64], r1
+ vld1.16 {q11}, [r12,:64], r1
+ vld1.16 {q15}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ loop_filter_q_8
+
+ @ Even though only 6 pixels per row have been changed, we write the
+ @ full 8 pixel registers.
+ transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ vst1.16 {q8}, [r12,:64], r1
+ vst1.16 {q12}, [r0, :64], r1
+ vst1.16 {q9}, [r12,:64], r1
+ vst1.16 {q13}, [r0, :64], r1
+ vst1.16 {q10}, [r12,:64], r1
+ vst1.16 {q14}, [r0, :64], r1
+ vst1.16 {q11}, [r12,:64], r1
+ vst1.16 {q15}, [r0, :64], r1
+ sub r12, r12, r1, lsl #2
+9:
+ add r0, r12, #8
+ bx lr
+6:
+ @ If we didn't need to do the flat8in part, we use the same writeback
+ @ as in loop_filter_h_4_8.
+ add r12, r12, #4
+ add r0, r0, #4
+ transpose16_4x4 q10, q11, q12, q13
+
+ vst1.16 {d20}, [r12], r1
+ vst1.16 {d21}, [r0], r1
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d23}, [r0], r1
+ vst1.16 {d24}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ vst1.16 {d26}, [r12], r1
+ vst1.16 {d27}, [r0], r1
+ sub r12, r12, r1, lsl #2
+ add r0, r12, #4
+ bx lr
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_4_16_neon
+ sub r12, r0, r1, lsl #3
+ @ Read p7-p0 using r12 and q0-q7 using r0
+ vld1.16 {d16}, [r12,:64], r1 @ p7
+ vld1.16 {d24}, [r0, :64], r1 @ q0
+ vld1.16 {d17}, [r12,:64], r1 @ p6
+ vld1.16 {d25}, [r0, :64], r1 @ q1
+ vld1.16 {d18}, [r12,:64], r1 @ p5
+ vld1.16 {d26}, [r0, :64], r1 @ q2
+ vld1.16 {d19}, [r12,:64], r1 @ p4
+ vld1.16 {d27}, [r0, :64], r1 @ q3
+ vld1.16 {d20}, [r12,:64], r1 @ p3
+ vld1.16 {d28}, [r0, :64], r1 @ q4
+ vld1.16 {d21}, [r12,:64], r1 @ p2
+ vld1.16 {d29}, [r0, :64], r1 @ q5
+ vld1.16 {d22}, [r12,:64], r1 @ p1
+ vld1.16 {d30}, [r0, :64], r1 @ q6
+ vld1.16 {d23}, [r12,:64], r1 @ p0
+ vld1.16 {d31}, [r0, :64], r1 @ q7
+ sub r12, r12, r1, lsl #3
+ sub r0, r0, r1, lsl #3
+ add r12, r12, r1
+
+ loop_filter_16
+
+ @ If we did the flat8out part, we get the output in
+ @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
+ @ store d2-d9 there, and d10-d17 into r0.
+ vst1.16 {d2}, [r12,:64], r1
+ vst1.16 {d10}, [r0, :64], r1
+ vst1.16 {d3}, [r12,:64], r1
+ vst1.16 {d11}, [r0, :64], r1
+ vst1.16 {d4}, [r12,:64], r1
+ vst1.16 {d12}, [r0, :64], r1
+ vst1.16 {d5}, [r12,:64], r1
+ vst1.16 {d13}, [r0, :64], r1
+ vst1.16 {d6}, [r12,:64], r1
+ vst1.16 {d14}, [r0, :64], r1
+ vst1.16 {d8}, [r12,:64], r1
+ vst1.16 {d15}, [r0, :64], r1
+ vst1.16 {d9}, [r12,:64], r1
+ vst1.16 {d17}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+9:
+ bx lr
+
+8:
+ add r12, r12, r1, lsl #2
+ @ If we didn't do the flat8out part, the output is left in the
+ @ input registers.
+ vst1.16 {d21}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx lr
+7:
+ sub r12, r0, r1, lsl #1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_v_16, 8, 4, 2, v
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v
+
+function vp9_loop_filter_h_16_4_16_neon
+ sub r12, r0, #16
+ sub r0, r0, #8
+ vld1.16 {d16}, [r12,:64], r1
+ vld1.16 {d20}, [r0, :64], r1
+ vld1.16 {d17}, [r12,:64], r1
+ vld1.16 {d21}, [r0, :64], r1
+ vld1.16 {d18}, [r12,:64], r1
+ vld1.16 {d22}, [r0, :64], r1
+ vld1.16 {d19}, [r12,:64], r1
+ vld1.16 {d23}, [r0, :64], r1
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, #16
+ add r0, r0, #16
+ vld1.16 {d24}, [r12,:64], r1
+ vld1.16 {d28}, [r0, :64], r1
+ vld1.16 {d25}, [r12,:64], r1
+ vld1.16 {d29}, [r0, :64], r1
+ vld1.16 {d26}, [r12,:64], r1
+ vld1.16 {d30}, [r0, :64], r1
+ vld1.16 {d27}, [r12,:64], r1
+ vld1.16 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #2
+ sub r12, r12, #16
+ sub r0, r0, #16
+
+        @ The 16x4 pixels read above are in four 4x4 blocks
+ transpose16_q_4x4 q8, q9, d16, d17, d18, d19
+ transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+ transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+ transpose16_q_4x4 q14, q15, d28, d29, d30, d31
+
+ loop_filter_16
+
+ @ Transpose back; this is the same transpose as above, but
+ @ we can't take advantage of q registers for the transpose, since
+        @ not all d registers in the transpose are consecutive.
+ transpose16_4x4 d16, d2, d3, d4
+ transpose16_4x4 d5, d6, d8, d9
+ transpose16_4x4 d10, d11, d12, d13
+ transpose16_4x4 d14, d15, d17, d31
+
+ vst1.16 {d16}, [r12,:64], r1
+ vst1.16 {d5}, [r0, :64], r1
+
+ vst1.16 {d2}, [r12,:64], r1
+ vst1.16 {d6}, [r0, :64], r1
+
+ vst1.16 {d3}, [r12,:64], r1
+ vst1.16 {d8}, [r0, :64], r1
+
+ vst1.16 {d4}, [r12,:64], r1
+ vst1.16 {d9}, [r0, :64], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, #16
+ add r0, r0, #16
+
+ vst1.16 {d10}, [r12,:64], r1
+ vst1.16 {d14}, [r0, :64], r1
+
+ vst1.16 {d11}, [r12,:64], r1
+ vst1.16 {d15}, [r0, :64], r1
+
+ vst1.16 {d12}, [r12,:64], r1
+ vst1.16 {d17}, [r0, :64], r1
+
+ vst1.16 {d13}, [r12,:64], r1
+ vst1.16 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, #8
+ bx lr
+9:
+ add r0, r0, #8
+ bx lr
+8:
+ add r12, r12, #8
+ add r0, r0, #8
+ transpose16_q_4x4 q10, q11, d20, d21, d22, d23
+ transpose16_q_4x4 q12, q13, d24, d25, d26, d27
+
+ vst1.16 {d20}, [r12,:64], r1
+ vst1.16 {d24}, [r0, :64], r1
+ vst1.16 {d21}, [r12,:64], r1
+ vst1.16 {d25}, [r0, :64], r1
+ vst1.16 {d22}, [r12,:64], r1
+ vst1.16 {d26}, [r0, :64], r1
+ vst1.16 {d23}, [r12,:64], r1
+ vst1.16 {d27}, [r0, :64], r1
+ sub r0, r0, r1, lsl #2
+ bx lr
+7:
+ add r12, r12, #12
+ add r0, r12, r1, lsl #1
+ transpose16_q_4x4 q11, q12, d22, d23, d24, d25
+
+ vst1.16 {d22}, [r12], r1
+ vst1.16 {d24}, [r0], r1
+ vst1.16 {d23}, [r12], r1
+ vst1.16 {d25}, [r0], r1
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #4
+ bx lr
+endfunc
+
+bpp_frontends_rep vp9_loop_filter_h_16, 8, 4, 2, h
+bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
new file mode 100644
index 0000000000..4b3608064a
--- /dev/null
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -0,0 +1,959 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+@ Do an 8x8 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \rq0, \rq2
+ vtrn.32 \rq1, \rq3
+ vtrn.16 \rq0, \rq1
+ vtrn.16 \rq2, \rq3
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+@ Do a 4x4 transpose, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1
+.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
+ vtrn.16 \rq0, \rq1
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
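+@ E, I and H are each passed as two packed per-8-pixel byte values
+@ (bits 7:0 for the first 8 pixels, bits 15:8 for the second 8).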
+.macro loop_filter_q
+ vdup.u8 d0, r2 @ E
+ lsr r2, r2, #8
+ vdup.u8 d2, r3 @ I
+ lsr r3, r3, #8
+ vdup.u8 d1, r2 @ E
+ vdup.u8 d3, r3 @ I
+
+ vabd.u8 q2, q8, q9 @ abs(p3 - p2)
+ vabd.u8 q3, q9, q10 @ abs(p2 - p1)
+ vabd.u8 q4, q10, q11 @ abs(p1 - p0)
+ vabd.u8 q5, q12, q13 @ abs(q0 - q1)
+ vabd.u8 q6, q13, q14 @ abs(q1 - q2)
+ vabd.u8 q7, q14, q15 @ abs(q2 - q3)
+ vmax.u8 q2, q2, q3
+ vmax.u8 q3, q4, q5
+ vmax.u8 q4, q6, q7
+ vabd.u8 q5, q11, q12 @ abs(p0 - q0)
+ vmax.u8 q2, q2, q3
+ vqadd.u8 q5, q5, q5 @ abs(p0 - q0) * 2
+ vabd.u8 q7, q10, q13 @ abs(p1 - q1)
+ vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u8 q7, q7, #1
+ vcle.u8 q2, q2, q1 @ max(abs()) <= I
+ vqadd.u8 q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ vcle.u8 q5, q5, q0
+ vand q2, q2, q5 @ fm
+
+ vshrn.u16 d10, q2, #4
+ vmov r2, r3, d10
+ orrs r2, r2, r3
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ ldr r3, [sp, #64]
+ vabd.u8 q3, q10, q11 @ abs(p1 - p0)
+ vabd.u8 q4, q13, q12 @ abs(q1 - q0)
+
+ vsubl.u8 q5, d20, d26 @ p1 - q1
+ vsubl.u8 q6, d21, d27 @ p1 - q1
+ vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
+ vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1)
+ vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1)
+ vdup.u8 d8, r3 @ H
+ lsr r3, r3, #8
+ vdup.u8 d9, r3 @ H
+ vsubl.u8 q6, d24, d22 @ q0 - p0
+ vsubl.u8 q7, d25, d23 @ q0 - p0
+ vcle.u8 q3, q3, q4 @ hev
+ vmov.s16 q0, #3
+ vand q3, q3, q2 @ !hev && fm && !flat8in
+
+ vmul.s16 q6, q6, q0 @ 3 * (q0 - p0)
+ vmul.s16 q7, q7, q0 @ 3 * (q0 - p0)
+ vbic q5, q5, q3 @ if (!hev) av_clip_int8 = 0
+ vaddw.s8 q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vaddw.s8 q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vmov.s8 q5, #4
+ vqmovn.s16 d12, q6
+ vqmovn.s16 d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
+ vmov.s8 q0, #3
+
+ vqadd.s8 q5, q6, q5 @ FFMIN(f + 4, 127)
+ vqadd.s8 q0, q6, q0 @ FFMIN(f + 3, 127)
+ vmovl.u8 q6, d22 @ p0
+ vmovl.u8 q7, d23 @ p0
+ vshr.s8 q5, q5, #3 @ f1
+ vshr.s8 q0, q0, #3 @ f2
+
+ vaddw.s8 q6, q6, d0 @ p0 + f2
+ vaddw.s8 q7, q7, d1 @ p0 + f2
+ vqmovun.s16 d0, q6 @ out p0
+ vmovl.u8 q6, d24 @ q0
+ vqmovun.s16 d1, q7 @ out p0
+ vmovl.u8 q7, d25 @ q0
+ vsubw.s8 q6, q6, d10 @ q0 - f1
+ vsubw.s8 q7, q7, d11 @ q0 - f1
+ vqmovun.s16 d12, q6 @ out q0
+ vqmovun.s16 d13, q7 @ out q0
+ vrshr.s8 q5, q5, #1 @ f = (f1 + 1) >> 1
+ vbit q11, q0, q2 @ if (fm && !flat8in)
+ vbit q12, q6, q2
+
+ vmovl.u8 q0, d20 @ p1
+ vmovl.u8 q2, d21 @ p1
+ vmovl.u8 q6, d26 @ q1
+ vmovl.u8 q7, d27 @ q1
+ vaddw.s8 q0, q0, d10 @ p1 + f
+ vaddw.s8 q2, q2, d11 @ p1 + f
+ vsubw.s8 q6, q6, d10 @ q1 - f
+ vsubw.s8 q7, q7, d11 @ q1 - f
+ vqmovun.s16 d0, q0 @ out p1
+ vqmovun.s16 d1, q2 @ out p1
+ vqmovun.s16 d12, q6 @ out q1
+ vqmovun.s16 d13, q7 @ out q1
+ vbit q10, q0, q3 @ if (!hev && fm && !flat8in)
+ vbit q13, q6, q3
+.endm
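
For reference, the per-column arithmetic that the loop_filter_q macro above
(and the wd == 4 path of the loop_filter macro below) implements can be
sketched in scalar C roughly as follows. This is a hedged illustration for
8 bpp with hypothetical helper and function names, not FFmpeg API:

    #include <stdint.h>
    #include <stdlib.h>

    static int clip_int8(int v)  { return v < -128 ? -128 : v > 127 ? 127 : v; }
    static int clip_uint8(int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }

    /* One pixel column of the normal inner filter, guarded by the fm mask. */
    static void lpf_inner_scalar(uint8_t *p3, uint8_t *p2, uint8_t *p1, uint8_t *p0,
                                 uint8_t *q0, uint8_t *q1, uint8_t *q2, uint8_t *q3,
                                 int E, int I, int H)
    {
        int fm = abs(*p3 - *p2) <= I && abs(*p2 - *p1) <= I && abs(*p1 - *p0) <= I &&
                 abs(*q1 - *q0) <= I && abs(*q2 - *q1) <= I && abs(*q3 - *q2) <= I &&
                 abs(*p0 - *q0) * 2 + (abs(*p1 - *q1) >> 1) <= E;
        if (!fm)                           /* the "beq 9f" early exit */
            return;
        int hev = abs(*p1 - *p0) > H || abs(*q1 - *q0) > H;
        int f   = clip_int8(3 * (*q0 - *p0) + (hev ? clip_int8(*p1 - *q1) : 0));
        int f1  = clip_int8(f + 4) >> 3;
        int f2  = clip_int8(f + 3) >> 3;
        *p0 = clip_uint8(*p0 + f2);
        *q0 = clip_uint8(*q0 - f1);
        if (!hev) {                        /* the vrshr.s8 #1 + vbit path */
            int f3 = (f1 + 1) >> 1;
            *p1 = clip_uint8(*p1 + f3);
            *q1 = clip_uint8(*q1 - f3);
        }
    }

The NEON macros compute this for 8 or 16 columns at a time, keeping the fm and
hev conditions as byte masks so the filtered values can be merged with vbit.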
+
+@ The input to and output from this macro is in the registers d16-d31,
+@ and d0-d7 are used as scratch registers.
+@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
+@ Depending on the width of the loop filter, we either use d16-d19
+@ and d28-d31 as temp registers, or d8-d15.
+@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
+ vdup.u8 d0, r2 @ E
+ vdup.u8 d2, r3 @ I
+ ldr r3, [sp]
+
+ vabd.u8 d4, d20, d21 @ abs(p3 - p2)
+ vabd.u8 d5, d21, d22 @ abs(p2 - p1)
+ vabd.u8 d6, d22, d23 @ abs(p1 - p0)
+ vabd.u8 d7, d24, d25 @ abs(q0 - q1)
+ vabd.u8 \tmp1, d25, d26 @ abs(q1 - q2)
+ vabd.u8 \tmp2, d26, d27 @ abs(q2 - q3)
+ vmax.u8 d4, d4, d5
+ vmax.u8 d5, d6, d7
+ vmax.u8 \tmp1, \tmp1, \tmp2
+ vabd.u8 d6, d23, d24 @ abs(p0 - q0)
+ vmax.u8 d4, d4, d5
+ vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2
+ vabd.u8 d5, d22, d25 @ abs(p1 - q1)
+ vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+ vshr.u8 d5, d5, #1
+ vcle.u8 d4, d4, d2 @ max(abs()) <= I
+ vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+ vcle.u8 d5, d6, d0
+ vand d4, d4, d5 @ fm
+
+ vdup.u8 d3, r3 @ H
+ vmov r2, r3, d4
+ orrs r2, r2, r3
+ @ If no pixels need filtering, just exit as soon as possible
+ beq 9f
+
+.if \wd >= 8
+ vmov.u8 d0, #1
+
+ vabd.u8 d6, d20, d23 @ abs(p3 - p0)
+ vabd.u8 d2, d21, d23 @ abs(p2 - p0)
+ vabd.u8 d1, d22, d23 @ abs(p1 - p0)
+ vabd.u8 \tmp1, d25, d24 @ abs(q1 - q0)
+ vabd.u8 \tmp2, d26, d24 @ abs(q2 - q0)
+ vabd.u8 \tmp3, d27, d24 @ abs(q3 - q0)
+ vmax.u8 d6, d6, d2
+ vmax.u8 d1, d1, \tmp1
+ vmax.u8 \tmp2, \tmp2, \tmp3
+.if \wd == 16
+ vabd.u8 d7, d16, d23 @ abs(p7 - p0)
+ vmax.u8 d6, d6, d1
+ vabd.u8 d2, d17, d23 @ abs(p6 - p0)
+ vmax.u8 d6, d6, \tmp2
+ vabd.u8 d1, d18, d23 @ abs(p5 - p0)
+ vcle.u8 d6, d6, d0 @ flat8in
+ vabd.u8 d8, d19, d23 @ abs(p4 - p0)
+ vand d6, d6, d4 @ flat8in && fm
+ vabd.u8 d9, d28, d24 @ abs(q4 - q0)
+ vbic d4, d4, d6 @ fm && !flat8in
+ vabd.u8 d10, d29, d24 @ abs(q5 - q0)
+ vabd.u8 d11, d30, d24 @ abs(q6 - q0)
+ vabd.u8 d12, d31, d24 @ abs(q7 - q0)
+
+ vmax.u8 d7, d7, d2
+ vmax.u8 d1, d1, d8
+ vmax.u8 d9, d9, d10
+ vmax.u8 d11, d11, d12
+ @ The rest of the calculation of flat8out is interleaved below
+.else
+ @ The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
+
+ @ Calculate the normal inner loop filter for 2 or 4 pixels
+ vabd.u8 d5, d22, d23 @ abs(p1 - p0)
+.if \wd == 16
+ vmax.u8 d7, d7, d1
+ vmax.u8 d9, d9, d11
+.elseif \wd == 8
+ vmax.u8 d6, d6, d1
+.endif
+ vabd.u8 d1, d25, d24 @ abs(q1 - q0)
+.if \wd == 16
+ vmax.u8 d7, d7, d9
+.elseif \wd == 8
+ vmax.u8 d6, d6, \tmp2
+.endif
+ vsubl.u8 \tmpq1, d22, d25 @ p1 - q1
+ vmax.u8 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
+ vsubl.u8 \tmpq2, d24, d23 @ q0 - p0
+ vmov.s16 \tmpq3, #3
+.if \wd == 8
+ vcle.u8 d6, d6, d0 @ flat8in
+.endif
+ vcle.u8 d5, d5, d3 @ !hev
+.if \wd == 8
+ vand d6, d6, d4 @ flat8in && fm
+.endif
+ vqmovn.s16 \tmp1, \tmpq1 @ av_clip_int8(p1 - q1)
+.if \wd == 16
+ vcle.u8 d7, d7, d0 @ flat8out
+.elseif \wd == 8
+ vbic d4, d4, d6 @ fm && !flat8in
+.endif
+ vand d5, d5, d4 @ !hev && fm && !flat8in
+.if \wd == 16
+ vand d7, d7, d6 @ flat8out && flat8in && fm
+.endif
+
+ vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
+ vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
+ vmov.s8 d2, #4
+ vaddw.s8 \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+ vmov.s8 d3, #3
+ vqmovn.s16 \tmp1, \tmpq2 @ f
+.if \wd == 16
+ vbic d6, d6, d7 @ fm && flat8in && !flat8out
+.endif
+
+ vqadd.s8 \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
+ vqadd.s8 \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
+ vmovl.u8 q0, d23 @ p0
+ vshr.s8 \tmp3, \tmp3, #3 @ f1
+ vshr.s8 \tmp4, \tmp4, #3 @ f2
+
+ vmovl.u8 q1, d24 @ q0
+ vaddw.s8 q0, q0, \tmp4 @ p0 + f2
+ vsubw.s8 q1, q1, \tmp3 @ q0 - f1
+ vqmovun.s16 d0, q0 @ out p0
+ vqmovun.s16 d1, q1 @ out q0
+ vrshr.s8 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
+ vbit d23, d0, d4 @ if (fm && !flat8in)
+ vbit d24, d1, d4
+
+ vmovl.u8 q0, d22 @ p1
+ vmovl.u8 q1, d25 @ q1
+.if \wd >= 8
+ vmov r2, r3, d6
+.endif
+ vaddw.s8 q0, q0, \tmp3 @ p1 + f
+ vsubw.s8 q1, q1, \tmp3 @ q1 - f
+.if \wd >= 8
+ orrs r2, r2, r3
+.endif
+ vqmovun.s16 d0, q0 @ out p1
+ vqmovun.s16 d2, q1 @ out q1
+ vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
+ vbit d25, d2, d5
+
+.if \wd >= 8
+ @ If no pixels need flat8in, jump to flat8out
+ @ (or to a writeout of the inner 4 pixels, for wd=8)
+ beq 6f
+
+ @ flat8in
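+        @ The first output is out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;
+        @ each following output updates the running sum in q0 by adding the
+        @ two incoming taps and subtracting the two outgoing ones.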
+ vaddl.u8 \tmpq1, d20, d21
+ vaddl.u8 \tmpq2, d22, d25
+ vaddl.u8 \tmpq3, d20, d22
+ vaddl.u8 \tmpq4, d23, d26
+ vadd.u16 q0, \tmpq1, \tmpq1
+ vaddw.u8 q0, q0, d23
+ vaddw.u8 q0, q0, d24
+ vadd.u16 q0, q0, \tmpq3
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vsub.s16 \tmpq4, \tmpq4, \tmpq3
+ vrshrn.u16 d2, q0, #3 @ out p2
+
+ vadd.u16 q0, q0, \tmpq2
+ vaddl.u8 \tmpq1, d20, d23
+ vaddl.u8 \tmpq2, d24, d27
+ vrshrn.u16 d3, q0, #3 @ out p1
+
+ vadd.u16 q0, q0, \tmpq4
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vaddl.u8 \tmpq3, d21, d24
+ vaddl.u8 \tmpq4, d25, d27
+ vrshrn.u16 d4, q0, #3 @ out p0
+
+ vadd.u16 q0, q0, \tmpq2
+ vsub.s16 \tmpq4, \tmpq4, \tmpq3
+ vaddl.u8 \tmpq1, d22, d25
+ vaddl.u8 \tmpq2, d26, d27
+ vrshrn.u16 d5, q0, #3 @ out q0
+
+ vadd.u16 q0, q0, \tmpq4
+ vsub.s16 \tmpq2, \tmpq2, \tmpq1
+ vrshrn.u16 \tmp5, q0, #3 @ out q1
+
+ vadd.u16 q0, q0, \tmpq2
+ @ The output here is written back into the input registers. This doesn't
+ @ matter for the flat8out part below, since we only update those pixels
+ @ which won't be touched below.
+ vbit d21, d2, d6
+ vbit d22, d3, d6
+ vbit d23, d4, d6
+ vrshrn.u16 \tmp6, q0, #3 @ out q2
+ vbit d24, d5, d6
+ vbit d25, \tmp5, d6
+ vbit d26, \tmp6, d6
+.endif
+.if \wd == 16
+6:
+ vorr d2, d6, d7
+ vmov r2, r3, d2
+ orrs r2, r2, r3
+ @ If no pixels needed flat8in nor flat8out, jump to a
+ @ writeout of the inner 4 pixels
+ beq 7f
+ vmov r2, r3, d7
+ orrs r2, r2, r3
+ @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+ beq 8f
+
+ @ flat8out
+ @ This writes all outputs into d2-d17 (skipping d6 and d16).
+ @ If this part is skipped, the output is read from d21-d26 (which is the input
+ @ to this section).
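+        @ The first output is out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 +
+        @ p0 + q0 + 8) >> 4; the rest update the running sum in q0 with the
+        @ same sliding window scheme as in flat8in above.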
+ vshll.u8 q0, d16, #3 @ 8 * d16
+ vsubw.u8 q0, q0, d16 @ 7 * d16
+ vaddw.u8 q0, q0, d17
+ vaddl.u8 q4, d17, d18
+ vaddl.u8 q5, d19, d20
+ vadd.s16 q0, q0, q4
+ vaddl.u8 q4, d16, d17
+ vaddl.u8 q6, d21, d22
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q5, d18, d25
+ vaddl.u8 q7, d23, d24
+ vsub.s16 q5, q5, q4
+ vadd.s16 q0, q0, q6
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d18
+ vaddl.u8 q7, d19, d26
+ vrshrn.u16 d2, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q4, d16, d19
+ vaddl.u8 q5, d20, d27
+ vsub.s16 q7, q7, q6
+ vbif d2, d17, d7
+ vrshrn.u16 d3, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d20
+ vaddl.u8 q7, d21, d28
+ vsub.s16 q5, q5, q4
+ vbif d3, d18, d7
+ vrshrn.u16 d4, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q4, d16, d21
+ vaddl.u8 q5, d22, d29
+ vsub.s16 q7, q7, q6
+ vbif d4, d19, d7
+ vrshrn.u16 d5, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q6, d16, d22
+ vaddl.u8 q7, d23, d30
+ vsub.s16 q5, q5, q4
+ vbif d5, d20, d7
+ vrshrn.u16 d6, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vaddl.u8 q5, d16, d23
+ vsub.s16 q7, q7, q6
+ vaddl.u8 q6, d24, d31
+ vbif d6, d21, d7
+ vrshrn.u16 d8, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vsub.s16 q5, q6, q5
+ vaddl.u8 q6, d17, d24
+ vaddl.u8 q7, d25, d31
+ vbif d8, d22, d7
+ vrshrn.u16 d9, q0, #4
+
+ vadd.s16 q0, q0, q5
+ vsub.s16 q7, q7, q6
+ vaddl.u8 q6, d26, d31
+ vbif d9, d23, d7
+ vrshrn.u16 d10, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q7, d18, d25
+ vaddl.u8 q9, d19, d26
+ vsub.s16 q6, q6, q7
+ vaddl.u8 q7, d27, d31
+ vbif d10, d24, d7
+ vrshrn.u16 d11, q0, #4
+
+ vadd.s16 q0, q0, q6
+ vaddl.u8 q6, d20, d27
+ vsub.s16 q7, q7, q9
+ vaddl.u8 q9, d28, d31
+ vbif d11, d25, d7
+ vsub.s16 q9, q9, q6
+ vrshrn.u16 d12, q0, #4
+
+ vadd.s16 q0, q0, q7
+ vaddl.u8 q7, d21, d28
+ vaddl.u8 q10, d29, d31
+ vbif d12, d26, d7
+ vrshrn.u16 d13, q0, #4
+
+ vadd.s16 q0, q0, q9
+ vsub.s16 q10, q10, q7
+ vaddl.u8 q9, d22, d29
+ vaddl.u8 q11, d30, d31
+ vbif d13, d27, d7
+ vrshrn.u16 d14, q0, #4
+
+ vadd.s16 q0, q0, q10
+ vsub.s16 q11, q11, q9
+ vbif d14, d28, d7
+ vrshrn.u16 d15, q0, #4
+
+ vadd.s16 q0, q0, q11
+ vbif d15, d29, d7
+ vrshrn.u16 d17, q0, #4
+ vbif d17, d30, d7
+.endif
+.endm
+
+@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers;
+@ for wd=16 those are needed as inputs/outputs, so we use d8-d15
+@ as temp registers there instead.
+.macro loop_filter_4
+ loop_filter 4, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
+.endm
+
+.macro loop_filter_8
+ loop_filter 8, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
+.endm
+
+.macro loop_filter_16
+ loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
+.endm
+
+
+@ The public functions in this file have the following signature:
+@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
+
+function ff_vp9_loop_filter_v_4_8_neon, export=1
+ sub r12, r0, r1, lsl #2
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_4
+
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+9:
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_4_8_neon, export=1
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #2
+ add r0, r0, #2
+
+ @ Transpose the 8x8 pixels, taking advantage of q registers, to get
+ @ one register per column.
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ loop_filter_4
+
+        @ We will only write the mid 4 pixels back; after the loop filter,
+ @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
+ @ (8x4 pixels). We need to transpose them to columns, done with a
+ @ 4x4 transpose (which in practice is two 4x4 transposes of the two
+ @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+9:
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_44_16_neon, export=1
+ vpush {q4-q7}
+ sub r12, r0, r1, lsl #2
+ vld1.8 {q8}, [r12,:128], r1 @ p3
+ vld1.8 {q12}, [r0, :128], r1 @ q0
+ vld1.8 {q9}, [r12,:128], r1 @ p2
+ vld1.8 {q13}, [r0, :128], r1 @ q1
+ vld1.8 {q10}, [r12,:128], r1 @ p1
+ vld1.8 {q14}, [r0, :128], r1 @ q2
+ vld1.8 {q11}, [r12,:128], r1 @ p0
+ vld1.8 {q15}, [r0, :128], r1 @ q3
+ sub r0, r0, r1, lsl #2
+ sub r12, r12, r1, lsl #1
+
+ loop_filter_q
+
+ vst1.8 {q10}, [r12,:128], r1
+ vst1.8 {q12}, [r0, :128], r1
+ vst1.8 {q11}, [r12,:128], r1
+ vst1.8 {q13}, [r0, :128], r1
+9:
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_44_16_neon, export=1
+ vpush {q4-q7}
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d16}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d18}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d28}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d30}, [r0], r1
+ mov r12, r0
+ add r0, r0, r1, lsl #2
+ vld1.8 {d17}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d19}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d29}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d31}, [r0], r1
+
+ @ Transpose the 16x8 pixels, as two 8x8 parts
+ transpose_8x8 q8, q9, q10, q11, q12, q13, q14, q15
+
+ loop_filter_q
+
+ sub r12, r0, r1, lsl #4
+ add r0, r12, r1, lsl #3
+ @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
+ @ outermost 2 pixels since they aren't changed.
+ add r12, r12, #2
+ add r0, r0, #2
+
+        @ We will only write the mid 4 pixels back; after the loop filter,
+ @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
+ @ We need to transpose them to columns, done with a 4x4 transpose
+ @ (which in practice is four 4x4 transposes of the 4x4 blocks of
+ @ the 16x4 pixels; into 4x16 pixels).
+ transpose_4x4 q10, q11, q12, q13
+
+ vst1.32 {d20[0]}, [r12], r1
+ vst1.32 {d21[0]}, [r0], r1
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d23[0]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d25[0]}, [r0], r1
+ vst1.32 {d26[0]}, [r12], r1
+ vst1.32 {d27[0]}, [r0], r1
+ vst1.32 {d20[1]}, [r12], r1
+ vst1.32 {d21[1]}, [r0], r1
+ vst1.32 {d22[1]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[1]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ vst1.32 {d26[1]}, [r12], r1
+ vst1.32 {d27[1]}, [r0], r1
+9:
+ vpop {q4-q7}
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_8_8_neon, export=1
+ sub r12, r0, r1, lsl #2
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+ add r12, r12, r1
+
+ loop_filter_8
+
+ vst1.8 {d21}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d26}, [r0, :64], r1
+9:
+ bx lr
+6:
+ sub r12, r0, r1, lsl #1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_8_8_neon, export=1
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ vld1.8 {d20}, [r12], r1
+ vld1.8 {d24}, [r0], r1
+ vld1.8 {d21}, [r12], r1
+ vld1.8 {d25}, [r0], r1
+ vld1.8 {d22}, [r12], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d23}, [r12], r1
+ vld1.8 {d27}, [r0], r1
+
+ sub r12, r12, r1, lsl #2
+ sub r0, r0, r1, lsl #2
+
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ loop_filter_8
+
+ @ Even though only 6 pixels per row have been changed, we write the
+ @ full 8 pixel registers.
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ vst1.8 {d20}, [r12], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r12], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r12], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r12], r1
+ vst1.8 {d27}, [r0], r1
+9:
+ bx lr
+6:
+ @ If we didn't need to do the flat8in part, we use the same writeback
+ @ as in loop_filter_h_4_8.
+ add r12, r12, #2
+ add r0, r0, #2
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ bx lr
+endfunc
+
+function vp9_loop_filter_v_16_neon
+ sub r12, r0, r1, lsl #3
+ @ Read p7-p0 using r12 and q0-q7 using r0
+ vld1.8 {d16}, [r12,:64], r1 @ p7
+ vld1.8 {d24}, [r0, :64], r1 @ q0
+ vld1.8 {d17}, [r12,:64], r1 @ p6
+ vld1.8 {d25}, [r0, :64], r1 @ q1
+ vld1.8 {d18}, [r12,:64], r1 @ p5
+ vld1.8 {d26}, [r0, :64], r1 @ q2
+ vld1.8 {d19}, [r12,:64], r1 @ p4
+ vld1.8 {d27}, [r0, :64], r1 @ q3
+ vld1.8 {d20}, [r12,:64], r1 @ p3
+ vld1.8 {d28}, [r0, :64], r1 @ q4
+ vld1.8 {d21}, [r12,:64], r1 @ p2
+ vld1.8 {d29}, [r0, :64], r1 @ q5
+ vld1.8 {d22}, [r12,:64], r1 @ p1
+ vld1.8 {d30}, [r0, :64], r1 @ q6
+ vld1.8 {d23}, [r12,:64], r1 @ p0
+ vld1.8 {d31}, [r0, :64], r1 @ q7
+ sub r12, r12, r1, lsl #3
+ sub r0, r0, r1, lsl #3
+ add r12, r12, r1
+
+ loop_filter_16
+
+ @ If we did the flat8out part, we get the output in
+ @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
+ @ store d2-d9 there, and d10-d17 into r0.
+ vst1.8 {d2}, [r12,:64], r1
+ vst1.8 {d10}, [r0, :64], r1
+ vst1.8 {d3}, [r12,:64], r1
+ vst1.8 {d11}, [r0, :64], r1
+ vst1.8 {d4}, [r12,:64], r1
+ vst1.8 {d12}, [r0, :64], r1
+ vst1.8 {d5}, [r12,:64], r1
+ vst1.8 {d13}, [r0, :64], r1
+ vst1.8 {d6}, [r12,:64], r1
+ vst1.8 {d14}, [r0, :64], r1
+ vst1.8 {d8}, [r12,:64], r1
+ vst1.8 {d15}, [r0, :64], r1
+ vst1.8 {d9}, [r12,:64], r1
+ vst1.8 {d17}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, r1
+
+9:
+ bx lr
+
+8:
+ add r12, r12, r1, lsl #2
+ @ If we didn't do the flat8out part, the output is left in the
+ @ input registers.
+ vst1.8 {d21}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d26}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+ bx lr
+7:
+ sub r12, r0, r1, lsl #1
+ vst1.8 {d22}, [r12,:64], r1
+ vst1.8 {d24}, [r0, :64], r1
+ vst1.8 {d23}, [r12,:64], r1
+ vst1.8 {d25}, [r0, :64], r1
+ sub r0, r0, r1, lsl #1
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_v_16_8_neon, export=1
+ ldr r12, [sp]
+ push {lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_v_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+
+function ff_vp9_loop_filter_v_16_16_neon, export=1
+ ldr r12, [sp]
+ // The filter clobbers r2 and r3, but we need to keep them for the second round
+ push {r2, r3, lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_v_16_neon
+ add r0, #8
+ ldr r2, [sp, #68]
+ ldr r3, [sp, #72]
+ bl vp9_loop_filter_v_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {r2, r3, pc}
+endfunc
+
+function vp9_loop_filter_h_16_neon
+ sub r12, r0, #8
+ vld1.8 {d16}, [r12,:64], r1
+ vld1.8 {d24}, [r0, :64], r1
+ vld1.8 {d17}, [r12,:64], r1
+ vld1.8 {d25}, [r0, :64], r1
+ vld1.8 {d18}, [r12,:64], r1
+ vld1.8 {d26}, [r0, :64], r1
+ vld1.8 {d19}, [r12,:64], r1
+ vld1.8 {d27}, [r0, :64], r1
+ vld1.8 {d20}, [r12,:64], r1
+ vld1.8 {d28}, [r0, :64], r1
+ vld1.8 {d21}, [r12,:64], r1
+ vld1.8 {d29}, [r0, :64], r1
+ vld1.8 {d22}, [r12,:64], r1
+ vld1.8 {d30}, [r0, :64], r1
+ vld1.8 {d23}, [r12,:64], r1
+ vld1.8 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+ sub r12, r12, r1, lsl #3
+
+        @ The 16x8 pixels read above are in two 8x8 blocks; the left
+ @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
+ @ of this, to get one column per register. This could be done with two
+ @ transpose_8x8 as below, but this takes advantage of the q registers.
+ transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d28, d29
+ vtrn.8 d30, d31
+
+ loop_filter_16
+
+ @ Transpose back; this is the same transpose as above, but
+ @ we can't take advantage of q registers for the transpose, since
+        @ not all d registers in the transpose are consecutive.
+ transpose_8x8 d16, d2, d3, d4, d5, d6, d8, d9
+ transpose_8x8 d10, d11, d12, d13, d14, d15, d17, d31
+
+ vst1.8 {d16}, [r12,:64], r1
+ vst1.8 {d10}, [r0, :64], r1
+
+ vst1.8 {d2}, [r12,:64], r1
+ vst1.8 {d11}, [r0, :64], r1
+
+ vst1.8 {d3}, [r12,:64], r1
+ vst1.8 {d12}, [r0, :64], r1
+
+ vst1.8 {d4}, [r12,:64], r1
+ vst1.8 {d13}, [r0, :64], r1
+
+ vst1.8 {d5}, [r12,:64], r1
+ vst1.8 {d14}, [r0, :64], r1
+
+ vst1.8 {d6}, [r12,:64], r1
+ vst1.8 {d15}, [r0, :64], r1
+
+ vst1.8 {d8}, [r12,:64], r1
+ vst1.8 {d17}, [r0, :64], r1
+
+ vst1.8 {d9}, [r12,:64], r1
+ vst1.8 {d31}, [r0, :64], r1
+ sub r0, r0, r1, lsl #3
+9:
+ bx lr
+8:
+ @ The same writeback as in loop_filter_h_8_8
+ sub r12, r0, #4
+ add r0, r12, r1, lsl #2
+ transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
+
+ vst1.8 {d20}, [r12], r1
+ vst1.8 {d24}, [r0], r1
+ vst1.8 {d21}, [r12], r1
+ vst1.8 {d25}, [r0], r1
+ vst1.8 {d22}, [r12], r1
+ vst1.8 {d26}, [r0], r1
+ vst1.8 {d23}, [r12], r1
+ vst1.8 {d27}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ bx lr
+7:
+ @ The same writeback as in loop_filter_h_4_8
+ sub r12, r0, #2
+ add r0, r12, r1, lsl #2
+ transpose_q_4x4 q11, q12, d22, d23, d24, d25
+ vst1.32 {d22[0]}, [r12], r1
+ vst1.32 {d22[1]}, [r0], r1
+ vst1.32 {d23[0]}, [r12], r1
+ vst1.32 {d23[1]}, [r0], r1
+ vst1.32 {d24[0]}, [r12], r1
+ vst1.32 {d24[1]}, [r0], r1
+ vst1.32 {d25[0]}, [r12], r1
+ vst1.32 {d25[1]}, [r0], r1
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #2
+ bx lr
+endfunc
+
+function ff_vp9_loop_filter_h_16_8_neon, export=1
+ ldr r12, [sp]
+ push {lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_h_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {pc}
+endfunc
+
+function ff_vp9_loop_filter_h_16_16_neon, export=1
+ ldr r12, [sp]
+ // The filter clobbers r2 and r3, but we need to keep them for the second round
+ push {r2, r3, lr}
+ vpush {q4-q7}
+ push {r12}
+ bl vp9_loop_filter_h_16_neon
+ add r0, r0, r1, lsl #3
+ ldr r2, [sp, #68]
+ ldr r3, [sp, #72]
+ bl vp9_loop_filter_h_16_neon
+ add sp, sp, #4
+ vpop {q4-q7}
+ pop {r2, r3, pc}
+endfunc
diff --git a/libavcodec/arm/vp9mc_16bpp_neon.S b/libavcodec/arm/vp9mc_16bpp_neon.S
new file mode 100644
index 0000000000..f6ec0375f2
--- /dev/null
+++ b/libavcodec/arm/vp9mc_16bpp_neon.S
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
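
As a point of reference for the copy/avg entry points below, the avg functions
compute a per-pixel rounding average of dst and ref (what vrhadd.u16 does),
while the copy functions are plain row-wise copies. A hedged scalar C model,
with an illustrative function name and byte strides as in the real signature:

    #include <stdint.h>
    #include <stddef.h>

    static void avg_w_16bpp_scalar(uint8_t *dst, ptrdiff_t dst_stride,
                                   const uint8_t *ref, ptrdiff_t ref_stride,
                                   int w, int h)
    {
        for (int y = 0; y < h; y++) {
            uint16_t       *d = (uint16_t *)(dst + y * dst_stride);
            const uint16_t *r = (const uint16_t *)(ref + y * ref_stride);
            for (int x = 0; x < w; x++)
                d[x] = (d[x] + r[x] + 1) >> 1;         /* rounding average */
        }
    }

The NEON versions below do the same thing 4 to 64 pixels per row at a time,
with the loads, averages and stores interleaved to hide latencies.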
+function ff_vp9_copy128_neon, export=1
+ ldr r12, [sp]
+ sub r1, r1, #96
+ sub r3, r3, #96
+1:
+ subs r12, r12, #1
+ vld1.16 {q0, q1}, [r2]!
+ vst1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q2, q3}, [r2]!
+ vst1.16 {q2, q3}, [r0, :128]!
+ vld1.16 {q8, q9}, [r2]!
+ vst1.16 {q8, q9}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vst1.16 {q10, q11}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #96
+ sub r3, r3, #96
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2]!
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128]!
+ vrhadd.u16 q1, q1, q9
+ vld1.16 {q12, q13}, [r2]!
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vld1.16 {q8, q9}, [r0, :128]!
+ vst1.16 {q2, q3}, [lr, :128]!
+ vrhadd.u16 q8, q8, q12
+ vld1.16 {q14, q15}, [r2], r3
+ vrhadd.u16 q9, q9, q13
+ vld1.16 {q10, q11}, [r0, :128], r1
+ vrhadd.u16 q10, q10, q14
+ vst1.16 {q8, q9}, [lr, :128]!
+ vrhadd.u16 q11, q11, q15
+ vst1.16 {q10, q11}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg32_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #32
+ sub r3, r3, #32
+ mov lr, r0
+1:
+ subs r12, r12, #1
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q0, q1}, [r0, :128]!
+ vld1.16 {q10, q11}, [r2], r3
+ vrhadd.u16 q0, q0, q8
+ vld1.16 {q2, q3}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q9
+ vrhadd.u16 q2, q2, q10
+ vst1.16 {q0, q1}, [lr, :128]!
+ vrhadd.u16 q3, q3, q11
+ vst1.16 {q2, q3}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg16_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #1
+ vld1.16 {q2, q3}, [r2], r3
+ vld1.16 {q0, q1}, [r0, :128]
+ vrhadd.u16 q0, q0, q2
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg8_16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ subs r12, r12, #2
+ vld1.16 {q2}, [r2], r3
+ vld1.16 {q0}, [r0, :128], r1
+ vld1.16 {q3}, [r2], r3
+ vrhadd.u16 q0, q0, q2
+ vld1.16 {q1}, [r0, :128], r1
+ vrhadd.u16 q1, q1, q3
+ vst1.16 {q0}, [lr, :128], r1
+ vst1.16 {q1}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_avg4_16_neon, export=1
+ ldr r12, [sp]
+1:
+ subs r12, r12, #2
+ vld1.16 {d2}, [r2], r3
+ vld1.16 {d0}, [r0, :64], r1
+ vld1.16 {d3}, [r2], r3
+ vrhadd.u16 d0, d0, d2
+ vld1.16 {d1}, [r0, :64]
+ sub r0, r0, r1
+ vrhadd.u16 d1, d1, d3
+ vst1.16 {d0}, [r0, :64], r1
+ vst1.16 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
+.macro vmull_lane dst, src, idx
+.if \idx < 4
+ vmull.s16 \dst, \src, d0[\idx]
+.else
+ vmull.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmlal_lane dst, src, idx
+.if \idx < 4
+ vmlal.s16 \dst, \src, d0[\idx]
+.else
+ vmlal.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
+@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
+.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src3, \src4, #(2*\offset)
+ vmlal_lane \dst1, d28, \offset
+ vmlal_lane \dst3, d30, \offset
+.if \size >= 8
+ vmlal_lane \dst2, d29, \offset
+ vmlal_lane \dst4, d31, \offset
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4 or 8 pixels in parallel; for larger
+@ widths it will do 8 pixels at a time and loop horizontally.
+@ The actual width (in bytes) is passed in r5, the height in r4 and
+@ the filter coefficients in r12.
+.macro do_8tap_h type, size
+function \type\()_8tap_\size\()h
+ sub r2, r2, #6
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 8 loops horizontally and needs
+        @ a reduced dst stride
+.if \size >= 8
+ sub r1, r1, r5
+.endif
+ @ size >= 8 loads two qwords and increments r2,
+        @ for size 4, three dwords without postincrement are enough
+.if \size >= 8
+ sub r3, r3, r5
+ sub r3, r3, #16
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 8
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 8
+ vld1.16 {q8, q9}, [r2]!
+ vld1.16 {q10, q11}, [r7]!
+.else
+ vld1.16 {d16, d17, d18}, [r2]
+ vld1.16 {d20, d21, d22}, [r7]
+.endif
+2:
+
+ vmull.s16 q1, d16, d0[0]
+ vmull.s16 q12, d20, d0[0]
+.if \size >= 8
+ vmull.s16 q2, d17, d0[0]
+ vmull.s16 q13, d21, d0[0]
+.endif
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 1, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 2, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 3, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 4, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 5, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 6, \size
+ extmlal q1, q2, q12, q13, q8, q9, q10, q11, 7, \size
+
+ @ Round, shift and saturate.
+ @ The vqrshrun takes care of clamping negative values to zero, but
+        @ we need to clamp against the max pixel value manually with vmin.
+ vqrshrun.s32 d2, q1, #7
+ vqrshrun.s32 d24, q12, #7
+.if \size >= 8
+ vqrshrun.s32 d3, q2, #7
+ vqrshrun.s32 d25, q13, #7
+ vmin.u16 q1, q1, q3
+ vmin.u16 q12, q12, q3
+.else
+ vmin.u16 d2, d2, d6
+ vmin.u16 d24, d24, d6
+.endif
+ @ Average
+.ifc \type,avg
+.if \size >= 8
+ vld1.16 {q14}, [r0,:128]
+ vld1.16 {q15}, [r6,:128]
+ vrhadd.u16 q1, q1, q14
+ vrhadd.u16 q12, q12, q15
+.else
+ vld1.16 {d28}, [r0,:64]
+ vld1.16 {d30}, [r6,:64]
+ vrhadd.u16 d2, d2, d28
+ vrhadd.u16 d24, d24, d30
+.endif
+.endif
+ @ Store and loop horizontally (for size >= 8)
+.if \size >= 8
+ subs r12, r12, #16
+ vst1.16 {q1}, [r0,:128]!
+ vst1.16 {q12}, [r6,:128]!
+ beq 3f
+ vmov q8, q9
+ vmov q10, q11
+ vld1.16 {q9}, [r2]!
+ vld1.16 {q11}, [r7]!
+ b 2b
+.else @ \size == 4
+ vst1.16 {d2}, [r0,:64]
+ vst1.16 {d24}, [r6,:64]
+.endif
+3:
+ @ Loop vertically
+ add r0, r0, r1
+ add r6, r6, r1
+ add r2, r2, r3
+ add r7, r7, r3
+ subs r4, r4, #2
+ bne 1b
+ pop {r4-r7}
+ bx lr
+endfunc
+.endm
+
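To make the data flow of the horizontal filter above easier to follow, here is
a hedged scalar C model of the "put" case (the "avg" case additionally does a
rounding average with the existing dst pixels). The function name is
illustrative, and for brevity the strides are taken in 16-bit elements rather
than in bytes as in the real functions:

    #include <stdint.h>
    #include <stddef.h>

    static void put_8tap_h_scalar(uint16_t *dst, ptrdiff_t dst_stride,
                                  const uint16_t *src, ptrdiff_t src_stride,
                                  int w, int h, const int16_t filter[8], int bpp)
    {
        const int pix_max = (1 << bpp) - 1;            /* the vmvn.u16 constant */
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)            /* taps at x - 3 .. x + 4 */
                    sum += filter[k] * src[x - 3 + k];
                sum = (sum + 64) >> 7;                 /* vqrshrun.s32 #7 */
                dst[x] = sum < 0 ? 0 : sum > pix_max ? pix_max : sum;
            }
            dst += dst_stride;
            src += src_stride;
        }
    }

The NEON code evaluates this for two rows at a time (r0/r6 and r2/r7) and, for
size >= 8, loops horizontally in steps of 8 pixels.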
+.macro do_8tap_h_size size
+do_8tap_h put, \size
+do_8tap_h avg, \size
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+
+.macro do_8tap_h_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
+ push {r4-r7}
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
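+        @ q3 = ~((0xffff << bpp) & 0xffff) = (1 << bpp) - 1, the max pixel value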
+ vmvn.u16 q3, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #2*\size
+.if \size >= 8
+ b \type\()_8tap_8h
+.else
+ b \type\()_8tap_4h
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size, bpp
+do_8tap_h_func put, regular, 1, \size, \bpp
+do_8tap_h_func avg, regular, 1, \size, \bpp
+do_8tap_h_func put, sharp, 2, \size, \bpp
+do_8tap_h_func avg, sharp, 2, \size, \bpp
+do_8tap_h_func put, smooth, 0, \size, \bpp
+do_8tap_h_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_h_filters_bpp bpp
+do_8tap_h_filters 64, \bpp
+do_8tap_h_filters 32, \bpp
+do_8tap_h_filters 16, \bpp
+do_8tap_h_filters 8, \bpp
+do_8tap_h_filters 4, \bpp
+.endm
+
+do_8tap_h_filters_bpp 10
+do_8tap_h_filters_bpp 12
+
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift and saturate and store qreg1-4
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\tmp1}, [r6,:64], r1
+ vld1.16 {\tmp2}, [r6,:64], r1
+ vld1.16 {\tmp3}, [r6,:64], r1
+ vld1.16 {\tmp4}, [r6,:64], r1
+.endif
+ vmin.u16 \dreg1, \dreg1, \minreg
+ vmin.u16 \dreg2, \dreg2, \minreg
+ vmin.u16 \dreg3, \dreg3, \minreg
+ vmin.u16 \dreg4, \dreg4, \minreg
+.ifc \type,avg
+ vrhadd.u16 \dreg1, \dreg1, \tmp1
+ vrhadd.u16 \dreg2, \dreg2, \tmp2
+ vrhadd.u16 \dreg3, \dreg3, \tmp3
+ vrhadd.u16 \dreg4, \dreg4, \tmp4
+.endif
+ vst1.16 {\dreg1}, [r0,:64], r1
+ vst1.16 {\dreg2}, [r0,:64], r1
+ vst1.16 {\dreg3}, [r0,:64], r1
+ vst1.16 {\dreg4}, [r0,:64], r1
+.endm
+
+@ Round, shift and saturate and store qreg1-4
+@ qreg1-2 belong to one line and qreg3-4 to the second line.
+@ dreg1-2 == qreg1, dreg3-4 == qreg2.
+.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
+ vqrshrun.s32 \dreg1, \qreg1, #7
+ vqrshrun.s32 \dreg2, \qreg2, #7
+ vqrshrun.s32 \dreg3, \qreg3, #7
+ vqrshrun.s32 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.16 {\qreg3}, [r6,:128], r1
+ vld1.16 {\qreg4}, [r6,:128], r1
+.endif
+ vmin.u16 \qreg1, \qreg1, \minreg
+ vmin.u16 \qreg2, \qreg2, \minreg
+.ifc \type,avg
+ vrhadd.u16 \qreg1, \qreg1, \qreg3
+ vrhadd.u16 \qreg2, \qreg2, \qreg4
+.endif
+ vst1.16 {\qreg1}, [r0,:128], r1
+ vst1.16 {\qreg2}, [r0,:128], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2).
+.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \tmp1, \src2, d0[1]
+ vmull.s16 \tmp2, \src3, d0[1]
+ vmlal.s16 \dst1, \src3, d0[2]
+ vmlal.s16 \dst2, \src4, d0[2]
+ vmlal.s16 \tmp1, \src4, d0[3]
+ vmlal.s16 \tmp2, \src5, d0[3]
+ vmlal.s16 \dst1, \src5, d1[0]
+ vmlal.s16 \dst2, \src6, d1[0]
+ vmlal.s16 \tmp1, \src6, d1[1]
+ vmlal.s16 \tmp2, \src7, d1[1]
+ vmlal.s16 \dst1, \src7, d1[2]
+ vmlal.s16 \dst2, \src8, d1[2]
+ vmlal.s16 \tmp1, \src8, d1[3]
+ vmlal.s16 \tmp2, \src9, d1[3]
+ vadd.s32 \dst1, \dst1, \tmp1
+ vadd.s32 \dst2, \dst2, \tmp2
+.endm
+
+@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
+@ but with double width (two input/output registers per row).
+.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
+ vmull.s16 \dst1, \src1, d0[0]
+ vmull.s16 \dst2, \src2, d0[0]
+ vmull.s16 \dst3, \src3, d0[0]
+ vmull.s16 \dst4, \src4, d0[0]
+ vmlal.s16 \dst1, \src3, d0[1]
+ vmlal.s16 \dst2, \src4, d0[1]
+ vmlal.s16 \dst3, \src5, d0[1]
+ vmlal.s16 \dst4, \src6, d0[1]
+ vmlal.s16 \dst1, \src5, d0[2]
+ vmlal.s16 \dst2, \src6, d0[2]
+ vmlal.s16 \dst3, \src7, d0[2]
+ vmlal.s16 \dst4, \src8, d0[2]
+ vmlal.s16 \dst1, \src7, d0[3]
+ vmlal.s16 \dst2, \src8, d0[3]
+ vmlal.s16 \dst3, \src9, d0[3]
+ vmlal.s16 \dst4, \src10, d0[3]
+ vmlal.s16 \dst1, \src9, d1[0]
+ vmlal.s16 \dst2, \src10, d1[0]
+ vmlal.s16 \dst3, \src11, d1[0]
+ vmlal.s16 \dst4, \src12, d1[0]
+ vmlal.s16 \dst1, \src11, d1[1]
+ vmlal.s16 \dst2, \src12, d1[1]
+ vmlal.s16 \dst3, \src13, d1[1]
+ vmlal.s16 \dst4, \src14, d1[1]
+ vmlal.s16 \dst1, \src13, d1[2]
+ vmlal.s16 \dst2, \src14, d1[2]
+ vmlal.s16 \dst3, \src15, d1[2]
+ vmlal.s16 \dst4, \src16, d1[2]
+ vmlal.s16 \dst1, \src15, d1[3]
+ vmlal.s16 \dst2, \src16, d1[3]
+ vmlal.s16 \dst3, \src17, d1[3]
+ vmlal.s16 \dst4, \src18, d1[3]
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12.
+.macro do_8tap_8v type
+function \type\()_8tap_8v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+1:
+.ifc \type,avg
+ mov r6, r0
+.endif
+ mov r12, r4
+
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+2:
+ vld1.16 {q12}, [r2], r3
+ vld1.16 {q13}, [r2], r3
+ vld1.16 {q14}, [r2], r3
+ vld1.16 {q15}, [r2], r3
+ convolve8 q2, q3, q4, q5, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q4, q5, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+ do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q4}, [r2], r3
+ vld1.16 {q5}, [r2], r3
+ vld1.16 {q6}, [r2], r3
+ vld1.16 {q7}, [r2], r3
+ convolve8 q2, q3, q8, q9, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15
+ do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ vld1.16 {q8}, [r2], r3
+ vld1.16 {q9}, [r2], r3
+ vld1.16 {q10}, [r2], r3
+ vld1.16 {q11}, [r2], r3
+ convolve8 q2, q3, q12, q13, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+ convolve8 q2, q3, q12, q13, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
+ do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
+
+ subs r12, r12, #4
+ bne 2b
+
+8:
+ subs r5, r5, #8
+ beq 9f
+ @ r0 -= h * dst_stride
+ mls r0, r1, r4, r0
+ @ r2 -= h * src_stride
+ mls r2, r3, r4, r2
+ @ r2 -= 8 * src_stride
+ sub r2, r2, r3, lsl #3
+ @ r2 += 1 * src_stride
+ add r2, r2, r3
+ add r2, r2, #16
+ add r0, r0, #16
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_8v put
+do_8tap_8v avg
+
+@ Instantiate a vertical filter function for filtering a 4 pixel wide
+@ slice. This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type
+function \type\()_8tap_4v
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+.ifc \type,avg
+ mov r6, r0
+.endif
+
+ vld1.16 {d16}, [r2], r3
+ vld1.16 {d17}, [r2], r3
+ vld1.16 {d18}, [r2], r3
+ vld1.16 {d19}, [r2], r3
+ vld1.16 {d20}, [r2], r3
+ vld1.16 {d21}, [r2], r3
+ vld1.16 {d22}, [r2], r3
+ vld1.16 {d23}, [r2], r3
+ vld1.16 {d24}, [r2], r3
+ vld1.16 {d25}, [r2], r3
+ vld1.16 {d26}, [r2], r3
+ convolve4 q2, q3, d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
+ convolve4 q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8, q9
+ do_store4 q2, d4, q3, d6, q14, d28, q15, d30, d5, d7, d29, d31, d2, \type
+
+ subs r4, r4, #4
+ beq 9f
+
+ vld1.16 {d27}, [r2], r3
+ vld1.16 {d28}, [r2], r3
+ vld1.16 {d29}, [r2], r3
+ vld1.16 {d30}, [r2], r3
+ convolve4 q2, q3, d20, d21, d22, d23, d24, d25, d26, d27, d28, q8, q9
+ convolve4 q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
+ do_store4 q2, d4, q3, d6, q8, d16, q9, d18, d5, d7, d17, d19, d2, \type
+
+9:
+ pop {r4-r6}
+ bx lr
+endfunc
+.endm
+
+do_8tap_4v put
+do_8tap_4v avg
+
+.macro do_8tap_v_func type, filter, offset, size, bpp
+function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
+ push {r4-r6}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #20]
+.if \size >= 8
+ vpush {q4-q7}
+.endif
+ vmvn.u16 q1, #((0xffff << \bpp) & 0xffff)
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ mov r5, #\size
+.if \size >= 8
+ b \type\()_8tap_8v
+.else
+ b \type\()_8tap_4v
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size, bpp
+do_8tap_v_func put, regular, 1, \size, \bpp
+do_8tap_v_func avg, regular, 1, \size, \bpp
+do_8tap_v_func put, sharp, 2, \size, \bpp
+do_8tap_v_func avg, sharp, 2, \size, \bpp
+do_8tap_v_func put, smooth, 0, \size, \bpp
+do_8tap_v_func avg, smooth, 0, \size, \bpp
+.endm
+
+.macro do_8tap_v_filters_bpp bpp
+do_8tap_v_filters 64, \bpp
+do_8tap_v_filters 32, \bpp
+do_8tap_v_filters 16, \bpp
+do_8tap_v_filters 8, \bpp
+do_8tap_v_filters 4, \bpp
+.endm
+
+do_8tap_v_filters_bpp 10
+do_8tap_v_filters_bpp 12
diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
new file mode 100644
index 0000000000..bd8cda7c30
--- /dev/null
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ All public functions in this file have the following signature:
+@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+@ const uint8_t *ref, ptrdiff_t ref_stride,
+@ int h, int mx, int my);
+
+function ff_vp9_copy64_neon, export=1
+ ldr r12, [sp]
+ sub r1, r1, #32
+ sub r3, r3, #32
+1:
+ vld1.8 {q0, q1}, [r2]!
+ vst1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q2, q3}, [r2], r3
+ subs r12, r12, #1
+ vst1.8 {q2, q3}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ sub r1, r1, #32
+ sub r3, r3, #32
+ mov lr, r0
+1:
+ vld1.8 {q8, q9}, [r2]!
+ vld1.8 {q0, q1}, [r0, :128]!
+ vld1.8 {q10, q11}, [r2], r3
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q2, q3}, [r0, :128], r1
+ vrhadd.u8 q1, q1, q9
+ vrhadd.u8 q2, q2, q10
+ vst1.8 {q0, q1}, [lr, :128]!
+ vrhadd.u8 q3, q3, q11
+ vst1.8 {q2, q3}, [lr, :128], r1
+ subs r12, r12, #1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_copy32_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {q0, q1}, [r2], r3
+ subs r12, r12, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg32_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {q2, q3}, [r2], r3
+ vld1.8 {q0, q1}, [r0, :128]
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+ subs r12, r12, #1
+ vst1.8 {q0, q1}, [r0, :128], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_copy16_neon, export=1
+ push {r4,lr}
+ ldr r12, [sp, #8]
+ add r4, r0, r1
+ add lr, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+1:
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [lr], r3
+ subs r12, r12, #2
+ vst1.8 {q0}, [r0, :128], r1
+ vst1.8 {q1}, [r4, :128], r1
+ bne 1b
+ pop {r4,pc}
+endfunc
+
+function ff_vp9_avg16_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q0}, [r0, :128], r1
+ vld1.8 {q3}, [r2], r3
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q1}, [r0, :128], r1
+ vrhadd.u8 q1, q1, q3
+ subs r12, r12, #2
+ vst1.8 {q0}, [lr, :128], r1
+ vst1.8 {q1}, [lr, :128], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+function ff_vp9_copy8_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ subs r12, r12, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg8_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d0}, [r0, :64], r1
+ vld1.8 {d3}, [r2], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d1}, [r0, :64]
+ sub r0, r0, r1
+ vrhadd.u8 d1, d1, d3
+ subs r12, r12, #2
+ vst1.8 {d0}, [r0, :64], r1
+ vst1.8 {d1}, [r0, :64], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_copy4_neon, export=1
+ ldr r12, [sp]
+1:
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ vst1.32 {d0[0]}, [r0, :32], r1
+ vld1.32 {d2[]}, [r2], r3
+ vst1.32 {d1[0]}, [r0, :32], r1
+ vld1.32 {d3[]}, [r2], r3
+ subs r12, r12, #4
+ vst1.32 {d2[0]}, [r0, :32], r1
+ vst1.32 {d3[0]}, [r0, :32], r1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_vp9_avg4_neon, export=1
+ push {lr}
+ ldr r12, [sp, #4]
+ mov lr, r0
+1:
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d0[]}, [r0, :32], r1
+ vld1.32 {d5[]}, [r2], r3
+ vrhadd.u8 d0, d0, d4
+ vld1.32 {d1[]}, [r0, :32], r1
+ vld1.32 {d6[]}, [r2], r3
+ vrhadd.u8 d1, d1, d5
+ vld1.32 {d2[]}, [r0, :32], r1
+ vld1.32 {d7[]}, [r2], r3
+ vrhadd.u8 d2, d2, d6
+ vld1.32 {d3[]}, [r0, :32], r1
+ subs r12, r12, #4
+ vst1.32 {d0[0]}, [lr, :32], r1
+ vrhadd.u8 d3, d3, d7
+ vst1.32 {d1[0]}, [lr, :32], r1
+ vst1.32 {d2[0]}, [lr, :32], r1
+ vst1.32 {d3[0]}, [lr, :32], r1
+ bne 1b
+ pop {pc}
+endfunc
+
+@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
+.macro vmul_lane dst, src, idx
+.if \idx < 4
+ vmul.s16 \dst, \src, d0[\idx]
+.else
+ vmul.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+.macro vmla_lane dst, src, idx
+.if \idx < 4
+ vmla.s16 \dst, \src, d0[\idx]
+.else
+ vmla.s16 \dst, \src, d1[\idx - 4]
+.endif
+.endm
+
+@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
+@ dst1-dst2 and dst3-dst4 for size >= 16)
+.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src4, \src5, #(2*\offset)
+.if \size >= 16
+ vmla_lane \dst1, q14, \offset
+ vext.8 q5, \src2, \src3, #(2*\offset)
+ vmla_lane \dst3, q15, \offset
+ vext.8 q6, \src5, \src6, #(2*\offset)
+ vmla_lane \dst2, q5, \offset
+ vmla_lane \dst4, q6, \offset
+.elseif \size == 8
+ vmla_lane \dst1, q14, \offset
+ vmla_lane \dst3, q15, \offset
+.else
+ vmla_lane \dst1d, d28, \offset
+ vmla_lane \dst3d, d30, \offset
+.endif
+.endm
+@ The same as above, but instead of accumulating straight into the
+@ destination, use a temp register and accumulate with saturation.
+.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
+ vext.8 q14, \src1, \src2, #(2*\offset)
+ vext.8 q15, \src4, \src5, #(2*\offset)
+.if \size >= 16
+ vmul_lane q14, q14, \offset
+ vext.8 q5, \src2, \src3, #(2*\offset)
+ vmul_lane q15, q15, \offset
+ vext.8 q6, \src5, \src6, #(2*\offset)
+ vmul_lane q5, q5, \offset
+ vmul_lane q6, q6, \offset
+.elseif \size == 8
+ vmul_lane q14, q14, \offset
+ vmul_lane q15, q15, \offset
+.else
+ vmul_lane d28, d28, \offset
+ vmul_lane d30, d30, \offset
+.endif
+.if \size == 4
+ vqadd.s16 \dst1d, \dst1d, d28
+ vqadd.s16 \dst3d, \dst3d, d30
+.else
+ vqadd.s16 \dst1, \dst1, q14
+ vqadd.s16 \dst3, \dst3, q15
+.if \size >= 16
+ vqadd.s16 \dst2, \dst2, q5
+ vqadd.s16 \dst4, \dst4, q6
+.endif
+.endif
+.endm
+
+
+@ Instantiate a horizontal filter function for the given size.
+@ This can work on 4, 8 or 16 pixels in parallel; for larger
+@ widths it will do 16 pixels at a time and loop horizontally.
+@ The actual width is passed in r5, the height in r4 and
+@ the filter coefficients in r12. idx2 is the index of the largest
+@ filter coefficient (3 or 4) and idx1 is the other one of them.
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+ sub r2, r2, #3
+ add r6, r0, r1
+ add r7, r2, r3
+ add r1, r1, r1
+ add r3, r3, r3
+ @ Only size >= 16 loops horizontally and needs
+        @ a reduced dst stride
+.if \size >= 16
+ sub r1, r1, r5
+.endif
+ @ size >= 16 loads two qwords and increments r2,
+        @ for size 4/8, one qword without postincrement is enough
+.if \size >= 16
+ sub r3, r3, r5
+ sub r3, r3, #8
+.endif
+ @ Load the filter vector
+ vld1.16 {q0}, [r12,:128]
+1:
+.if \size >= 16
+ mov r12, r5
+.endif
+ @ Load src
+.if \size >= 16
+ vld1.8 {d18, d19, d20}, [r2]!
+ vld1.8 {d24, d25, d26}, [r7]!
+.else
+ vld1.8 {q9}, [r2]
+ vld1.8 {q12}, [r7]
+.endif
+ vmovl.u8 q8, d18
+ vmovl.u8 q9, d19
+ vmovl.u8 q11, d24
+ vmovl.u8 q12, d25
+.if \size >= 16
+ vmovl.u8 q10, d20
+ vmovl.u8 q13, d26
+.endif
+2:
+
+ @ Accumulate, adding idx2 last with a separate
+ @ saturating add. The positive filter coefficients
+ @ for all indices except idx2 must add up to less
+ @ than 127 for this not to overflow.
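+        @ (With 8 bit input pixels this partial sum stays below
+        @ 127 * 255 = 32385, safely within the signed 16 bit accumulator;
+        @ the idx2 tap is then applied with saturating vqadd in extmulqadd.)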
+ vmul.s16 q1, q8, d0[0]
+ vmul.s16 q3, q11, d0[0]
+.if \size >= 16
+ vmul.s16 q2, q9, d0[0]
+ vmul.s16 q4, q12, d0[0]
+.endif
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size
+ extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size
+ extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size
+
+ @ Round, shift and saturate
+ vqrshrun.s16 d2, q1, #7
+ vqrshrun.s16 d6, q3, #7
+.if \size >= 16
+ vqrshrun.s16 d3, q2, #7
+ vqrshrun.s16 d7, q4, #7
+.endif
+ @ Average
+.ifc \type,avg
+.if \size >= 16
+ vld1.8 {q14}, [r0,:128]
+ vld1.8 {q15}, [r6,:128]
+ vrhadd.u8 q1, q1, q14
+ vrhadd.u8 q3, q3, q15
+.elseif \size == 8
+ vld1.8 {d28}, [r0,:64]
+ vld1.8 {d30}, [r6,:64]
+ vrhadd.u8 d2, d2, d28
+ vrhadd.u8 d6, d6, d30
+.else
+ @ We only need d28[0], but [] is faster on some cores
+ vld1.32 {d28[]}, [r0,:32]
+ vld1.32 {d30[]}, [r6,:32]
+ vrhadd.u8 d2, d2, d28
+ vrhadd.u8 d6, d6, d30
+.endif
+.endif
+ @ Store and loop horizontally (for size >= 16)
+.if \size >= 16
+ subs r12, r12, #16
+ vst1.8 {q1}, [r0,:128]!
+ vst1.8 {q3}, [r6,:128]!
+ beq 3f
+ vmov q8, q10
+ vmov q11, q13
+ vld1.8 {q10}, [r2]!
+ vld1.8 {q13}, [r7]!
+ vmovl.u8 q9, d20
+ vmovl.u8 q10, d21
+ vmovl.u8 q12, d26
+ vmovl.u8 q13, d27
+ b 2b
+.elseif \size == 8
+ vst1.8 {d2}, [r0,:64]
+ vst1.8 {d6}, [r6,:64]
+.else @ \size == 4
+ vst1.32 {d2[0]}, [r0,:32]
+ vst1.32 {d6[0]}, [r6,:32]
+.endif
+3:
+ @ Loop vertically
+ add r0, r0, r1
+ add r6, r6, r1
+ add r2, r2, r3
+ add r7, r7, r3
+ subs r4, r4, #2
+ bne 1b
+.if \size >= 16
+ vpop {q4-q6}
+.endif
+ pop {r4-r7}
+ bx lr
+endfunc
+.endm
+
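+@ The _34/_43 suffix encodes which of taps 3 and 4 carries the largest
+@ coefficient (idx2); the entry point wrappers below choose the variant
+@ from the subpel position.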
+.macro do_8tap_h_size size
+do_8tap_h put, \size, 3, 4
+do_8tap_h avg, \size, 3, 4
+do_8tap_h put, \size, 4, 3
+do_8tap_h avg, \size, 4, 3
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
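+@ Public entry points. The argument order is assumed to follow the
+@ vp9_mc_func prototype used by the C code:
+@   (dst, dst_stride, src, src_stride, h, mx, my)
+@ so h and mx are fetched from the stack below, after the pushes.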
+.macro do_8tap_h_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
+ push {r4-r7}
+.if \size >= 16
+ vpush {q4-q6}
+ ldr r4, [sp, #64]
+ ldr r5, [sp, #68]
+.else
+ ldr r4, [sp, #16]
+ ldr r5, [sp, #20]
+.endif
+ movrelx r12, X(ff_vp9_subpel_filters), r6
+ add r12, r12, 256*\offset
+ cmp r5, #8
+ add r12, r12, r5, lsl #4
+ mov r5, #\size
+.if \size >= 16
+ bge \type\()_8tap_16h_34
+ b \type\()_8tap_16h_43
+.else
+ bge \type\()_8tap_\size\()h_34
+ b \type\()_8tap_\size\()h_43
+.endif
+endfunc
+.endm
+
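+@ \offset selects the filter bank within ff_vp9_subpel_filters, in the
+@ order the invocations below imply (0 = smooth, 1 = regular, 2 = sharp,
+@ assumed to match the FilterMode order in the C code); each bank is
+@ 16 phases * 8 taps * 2 bytes = 256 bytes, hence the 256*\offset in
+@ do_8tap_h_func above.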
+.macro do_8tap_h_filters size
+do_8tap_h_func put, regular, 1, \size
+do_8tap_h_func avg, regular, 1, \size
+do_8tap_h_func put, sharp, 2, \size
+do_8tap_h_func avg, sharp, 2, \size
+do_8tap_h_func put, smooth, 0, \size
+do_8tap_h_func avg, smooth, 0, \size
+.endm
+
+do_8tap_h_filters 64
+do_8tap_h_filters 32
+do_8tap_h_filters 16
+do_8tap_h_filters 8
+do_8tap_h_filters 4
+
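+@ Emit the accumulated literal pool here, between the horizontal and
+@ vertical filters, presumably to keep any literal loads above in range.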
+.ltorg
+
+@ Vertical filters
+
+@ Round, shift and saturate, then store qreg1-2 over 4 lines
+.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
+ vqrshrun.s16 \dreg1, \qreg1, #7
+ vqrshrun.s16 \dreg2, \qreg2, #7
+.ifc \type,avg
+ vld1.32 {\tmp1[]}, [r0,:32], r1
+ vld1.32 {\tmp2[]}, [r0,:32], r1
+ vld1.32 {\tmp1[1]}, [r0,:32], r1
+ vld1.32 {\tmp2[1]}, [r0,:32], r1
+ vrhadd.u8 \dreg1, \dreg1, \tmp1
+ vrhadd.u8 \dreg2, \dreg2, \tmp2
+ sub r0, r0, r1, lsl #2
+.endif
+ vst1.32 {\dreg1[0]}, [r0,:32], r1
+ vst1.32 {\dreg2[0]}, [r0,:32], r1
+ vst1.32 {\dreg1[1]}, [r0,:32], r1
+ vst1.32 {\dreg2[1]}, [r0,:32], r1
+.endm
+
+@ Round, shift and saturate, then store qreg1-4
+.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
+ vqrshrun.s16 \dreg1, \qreg1, #7
+ vqrshrun.s16 \dreg2, \qreg2, #7
+ vqrshrun.s16 \dreg3, \qreg3, #7
+ vqrshrun.s16 \dreg4, \qreg4, #7
+.ifc \type,avg
+ vld1.8 {\tmp1}, [r0,:64], r1
+ vld1.8 {\tmp2}, [r0,:64], r1
+ vld1.8 {\tmp3}, [r0,:64], r1
+ vld1.8 {\tmp4}, [r0,:64], r1
+ vrhadd.u8 \dreg1, \dreg1, \tmp1
+ vrhadd.u8 \dreg2, \dreg2, \tmp2
+ vrhadd.u8 \dreg3, \dreg3, \tmp3
+ vrhadd.u8 \dreg4, \dreg4, \tmp4
+ sub r0, r0, r1, lsl #2
+.endif
+ vst1.8 {\dreg1}, [r0,:64], r1
+ vst1.8 {\dreg2}, [r0,:64], r1
+ vst1.8 {\dreg3}, [r0,:64], r1
+ vst1.8 {\dreg4}, [r0,:64], r1
+.endm
+
+@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
+@ at the end with saturation. Indices 0 and 7 always have negative or zero
+@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
+@ largest coefficient.
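+@ d0[]/d1[] hold the eight filter taps loaded from r12, so e.g.
+@ dst1 = sat16(filter[0]*src1 + filter[1]*src2 + ... + filter[7]*src8).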
+.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
+ vmul.s16 \dst1, \src2, d0[1]
+ vmul.s16 \dst2, \src3, d0[1]
+ vmul.s16 \tmp1, \src1, d0[0]
+ vmul.s16 \tmp2, \src2, d0[0]
+ vmla.s16 \dst1, \src3, d0[2]
+ vmla.s16 \dst2, \src4, d0[2]
+.if \idx1 == 3
+ vmla.s16 \dst1, \src4, d0[3]
+ vmla.s16 \dst2, \src5, d0[3]
+.else
+ vmla.s16 \dst1, \src5, d1[0]
+ vmla.s16 \dst2, \src6, d1[0]
+.endif
+ vmla.s16 \dst1, \src6, d1[1]
+ vmla.s16 \dst2, \src7, d1[1]
+ vmla.s16 \tmp1, \src8, d1[3]
+ vmla.s16 \tmp2, \src9, d1[3]
+ vmla.s16 \dst1, \src7, d1[2]
+ vmla.s16 \dst2, \src8, d1[2]
+.if \idx2 == 3
+ vmla.s16 \tmp1, \src4, d0[3]
+ vmla.s16 \tmp2, \src5, d0[3]
+.else
+ vmla.s16 \tmp1, \src5, d1[0]
+ vmla.s16 \tmp2, \src6, d1[0]
+.endif
+ vqadd.s16 \dst1, \dst1, \tmp1
+ vqadd.s16 \dst2, \dst2, \tmp2
+.endm
+
+@ Load pixels and extend them to 16 bit
+.macro loadl dst1, dst2, dst3, dst4
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+.ifnb \dst4
+ vld1.8 {d5}, [r2], r3
+.endif
+ vmovl.u8 \dst1, d2
+ vmovl.u8 \dst2, d3
+ vmovl.u8 \dst3, d4
+.ifnb \dst4
+ vmovl.u8 \dst4, d5
+.endif
+.endm
+
+@ Instantiate a vertical filter function for filtering 8 pixels at a time.
+@ The height is passed in r4, the width in r5 and the filter coefficients
+@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
+@ and idx1 is the other one.
+.macro do_8tap_8v type, idx1, idx2
+function \type\()_8tap_8v_\idx1\idx2
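+ @ Back the source pointer up by 3 rows; the 8-tap filter needs
+ @ 3 rows of context above the first output row.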
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+1:
+ mov r12, r4
+
+ loadl q5, q6, q7
+ loadl q8, q9, q10, q11
+2:
+ loadl q12, q13, q14, q15
+ convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5
+ convolve q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6
+ do_store q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ loadl q4, q5, q6, q7
+ convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9
+ convolve q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10
+ do_store q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type
+
+ subs r12, r12, #4
+ beq 8f
+
+ loadl q8, q9, q10, q11
+ convolve q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13
+ convolve q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14
+ do_store q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type
+
+ subs r12, r12, #4
+ bne 2b
+
+8:
+ subs r5, r5, #8
+ beq 9f
+ @ r0 -= h * dst_stride
+ mls r0, r1, r4, r0
+ @ r2 -= h * src_stride
+ mls r2, r3, r4, r2
+ @ r2 -= 8 * src_stride
+ sub r2, r2, r3, lsl #3
+ @ r2 += 1 * src_stride
+ add r2, r2, r3
+ add r2, r2, #8
+ add r0, r0, #8
+ b 1b
+9:
+ vpop {q4-q7}
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+do_8tap_8v put, 3, 4
+do_8tap_8v put, 4, 3
+do_8tap_8v avg, 3, 4
+do_8tap_8v avg, 4, 3
+
+@ Instantiate a vertical filter function for filtering a 4-pixel-wide
+@ slice. The first half of each register contains one row, while the
+@ second half contains the row two steps further down (which is also
+@ stored in the first half of the register two steps ahead). The
+@ convolution produces two outputs at a time; q5-q12 feed one and
+@ q6-q13 the other. The first half of the first output is the first
+@ output row, the first half of the other output is the second output
+@ row; the second halves of the outputs are rows 3 and 4.
+@ This is only designed to work for 4 or 8 output lines.
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
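+ @ As in the 8-pixel version, back the source pointer up by 3 rows.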
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, r3
+ vld1.16 {q0}, [r12, :128]
+
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2], r3
+ vld1.32 {d7[]}, [r2], r3
+ vext.8 d2, d2, d4, #4
+ vld1.32 {d8[]}, [r2], r3
+ vext.8 d3, d3, d5, #4
+ vld1.32 {d9[]}, [r2], r3
+ vmovl.u8 q5, d2
+ vext.8 d4, d4, d6, #4
+ vld1.32 {d28[]}, [r2], r3
+ vmovl.u8 q6, d3
+ vext.8 d5, d5, d7, #4
+ vld1.32 {d29[]}, [r2], r3
+ vmovl.u8 q7, d4
+ vext.8 d6, d6, d8, #4
+ vld1.32 {d30[]}, [r2], r3
+ vmovl.u8 q8, d5
+ vext.8 d7, d7, d9, #4
+ vmovl.u8 q9, d6
+ vext.8 d8, d8, d28, #4
+ vmovl.u8 q10, d7
+ vext.8 d9, d9, d29, #4
+ vmovl.u8 q11, d8
+ vext.8 d28, d28, d30, #4
+ vmovl.u8 q12, d9
+ vmovl.u8 q13, d28
+
+ convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3
+ do_store4 q1, d2, q2, d4, d3, d5, \type
+ subs r4, r4, #4
+ beq 9f
+
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vext.8 d29, d29, d2, #4
+ vext.8 d30, d30, d3, #4
+ vld1.32 {d2[1]}, [r2], r3
+ vmovl.u8 q14, d29
+ vld1.32 {d3[1]}, [r2], r3
+ vmovl.u8 q15, d30
+ vmovl.u8 q5, d2
+ vmovl.u8 q6, d3
+
+ convolve q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3
+ do_store4 q1, d2, q2, d4, d3, d5, \type
+
+9:
+ vpop {q4-q7}
+ pop {r4-r5}
+ bx lr
+endfunc
+.endm
+
+do_8tap_4v put, 3, 4
+do_8tap_4v put, 4, 3
+do_8tap_4v avg, 3, 4
+do_8tap_4v avg, 4, 3
+
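+@ Entry point wrappers for the vertical filters; the argument layout is
+@ assumed to be the same as for the horizontal ones, except that the
+@ filter phase is selected by my rather than mx.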
+.macro do_8tap_v_func type, filter, offset, size
+function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
+ push {r4-r5}
+ vpush {q4-q7}
+ ldr r4, [sp, #72]
+ movrelx r12, X(ff_vp9_subpel_filters), r5
+ ldr r5, [sp, #80]
+ add r12, r12, 256*\offset
+ add r12, r12, r5, lsl #4
+ cmp r5, #8
+ mov r5, #\size
+.if \size >= 8
+ bge \type\()_8tap_8v_34
+ b \type\()_8tap_8v_43
+.else
+ bge \type\()_8tap_4v_34
+ b \type\()_8tap_4v_43
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size
+do_8tap_v_func put, regular, 1, \size
+do_8tap_v_func avg, regular, 1, \size
+do_8tap_v_func put, sharp, 2, \size
+do_8tap_v_func avg, sharp, 2, \size
+do_8tap_v_func put, smooth, 0, \size
+do_8tap_v_func avg, smooth, 0, \size
+.endm
+
+do_8tap_v_filters 64
+do_8tap_v_filters 32
+do_8tap_v_filters 16
+do_8tap_v_filters 8
+do_8tap_v_filters 4