From d5dd8c7bf0f0d77c581db3236e0d938f06fd5591 Mon Sep 17 00:00:00 2001 From: Janne Grunau <janne-libav@jannau.net> Date: Wed, 18 Dec 2013 15:56:50 +0100 Subject: aarch64: h264 qpel NEON optimizations Ported from ARMv7 NEON. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/h264qpel_init_aarch64.c | 172 ++++ libavcodec/aarch64/h264qpel_neon.S | 934 +++++++++++++++++++++++++++++ libavcodec/aarch64/neon.S | 64 ++ 4 files changed, 1172 insertions(+) create mode 100644 libavcodec/aarch64/h264qpel_init_aarch64.c create mode 100644 libavcodec/aarch64/h264qpel_neon.S diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 06e3c778ad..1d80d9a268 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -1,7 +1,9 @@ OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o +OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264idct_neon.o +NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c new file mode 100644 index 0000000000..11611df241 --- /dev/null +++ b/libavcodec/aarch64/h264qpel_init_aarch64.c @@ -0,0 +1,172 @@ +/* + * ARM NEON optimised DSP functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/h264qpel.h" + +void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); + +void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); + +void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t 
*src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); + +void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); +void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride); + +av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth) +{ + const int high_bit_depth = bit_depth > 8; + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && !high_bit_depth) { + /* c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; */ + c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; + c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; + c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; + c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; + c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; + c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; + c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; + c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; + c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; + + /* c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; */ + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; + 
c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; + c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; + c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; + c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; + c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; + + /* c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; */ + c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; + c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; + c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; + c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; + c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; + c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; + c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; + c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; + c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; + + /* c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; */ + c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; + c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; + c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; + c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; + c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; + c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; + c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; + c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; + c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; + c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; + } +} diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S new file mode 100644 index 0000000000..731dc0658d --- /dev/null +++ b/libavcodec/aarch64/h264qpel_neon.S @@ -0,0 +1,934 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> + * + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + + /* H.264 qpel MC */ + +.macro lowpass_const r + movz \r, #20, lsl #16 + movk \r, #5 + mov v6.S[0], \r +.endm + +//trashes v0-v5 +.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 + ext v2.8B, \r0\().8B, \r1\().8B, #2 + ext v3.8B, \r0\().8B, \r1\().8B, #3 + uaddl v2.8H, v2.8B, v3.8B + ext v4.8B, \r0\().8B, \r1\().8B, #1 + ext v5.8B, \r0\().8B, \r1\().8B, #4 + uaddl v4.8H, v4.8B, v5.8B + ext v1.8B, \r0\().8B, \r1\().8B, #5 + uaddl \d0\().8H, \r0\().8B, v1.8B + ext v0.8B, \r2\().8B, \r3\().8B, #2 + mla \d0\().8H, v2.8H, v6.H[1] + ext v1.8B, \r2\().8B, \r3\().8B, #3 + uaddl v0.8H, v0.8B, v1.8B + ext v1.8B, \r2\().8B, \r3\().8B, #1 + mls \d0\().8H, v4.8H, v6.H[0] + ext v3.8B, \r2\().8B, \r3\().8B, #4 + uaddl v1.8H, v1.8B, v3.8B + ext v2.8B, \r2\().8B, \r3\().8B, #5 + uaddl \d1\().8H, \r2\().8B, v2.8B + mla \d1\().8H, v0.8H, v6.H[1] + mls \d1\().8H, v1.8H, v6.H[0] + .if \narrow + sqrshrun \d0\().8B, \d0\().8H, #5 + sqrshrun \d1\().8B, \d1\().8H, #5 + .endif +.endm + +//trashes v0-v5, v7, v30-v31 +.macro lowpass_8H r0, r1 + ext v0.16B, \r0\().16B, \r0\().16B, #2 + ext v1.16B, \r0\().16B, \r0\().16B, #3 + uaddl v0.8H, v0.8B, v1.8B + ext v2.16B, \r0\().16B, \r0\().16B, #1 + ext v3.16B, \r0\().16B, \r0\().16B, #4 + uaddl v2.8H, v2.8B, v3.8B + ext v30.16B, \r0\().16B, \r0\().16B, #5 + uaddl \r0\().8H, \r0\().8B, v30.8B + ext v4.16B, \r1\().16B, \r1\().16B, #2 + mla \r0\().8H, v0.8H, v6.H[1] + ext v5.16B, \r1\().16B, \r1\().16B, #3 + uaddl v4.8H, v4.8B, v5.8B + ext v7.16B, \r1\().16B, \r1\().16B, #1 + mls \r0\().8H, v2.8H, v6.H[0] + ext v0.16B, \r1\().16B, \r1\().16B, #4 + uaddl v7.8H, v7.8B, v0.8B + ext v31.16B, \r1\().16B, \r1\().16B, #5 + uaddl \r1\().8H, \r1\().8B, v31.8B + mla \r1\().8H, v4.8H, v6.H[1] + mls \r1\().8H, v7.8H, v6.H[0] +.endm + +// trashes v2-v5, v30 +.macro lowpass_8_1 r0, r1, d0, narrow=1 + ext v2.8B, \r0\().8B, \r1\().8B, #2 + ext v3.8B, \r0\().8B, \r1\().8B, #3 + uaddl v2.8H, v2.8B, v3.8B + ext v4.8B, \r0\().8B, \r1\().8B, #1 + ext v5.8B, \r0\().8B, \r1\().8B, #4 + uaddl v4.8H, v4.8B, v5.8B + ext v30.8B, \r0\().8B, \r1\().8B, #5 + uaddl \d0\().8H, \r0\().8B, v30.8B + mla \d0\().8H, v2.8H, v6.H[1] + mls \d0\().8H, v4.8H, v6.H[0] + .if \narrow + sqrshrun \d0\().8B, \d0\().8H, #5 + .endif +.endm + +// trashed v0-v7 +.macro lowpass_8.16 r0, r1, r2 + ext v1.16B, \r0\().16B, \r1\().16B, #4 + ext v0.16B, \r0\().16B, \r1\().16B, #6 + saddl v5.4S, v1.4H, v0.4H + ext v2.16B, \r0\().16B, \r1\().16B, #2 + saddl2 v1.4S, v1.8H, v0.8H + ext v3.16B, \r0\().16B, \r1\().16B, #8 + saddl v6.4S, v2.4H, v3.4H + ext \r1\().16B, \r0\().16B, \r1\().16B, #10 + saddl2 v2.4S, v2.8H, v3.8H + saddl v0.4S, \r0\().4H, \r1\().4H + saddl2 v4.4S, \r0\().8H, \r1\().8H + + shl v3.4S, v5.4S, #4 + shl v5.4S, v5.4S, #2 + shl v7.4S, 
v6.4S, #2 + add v5.4S, v5.4S, v3.4S + add v6.4S, v6.4S, v7.4S + + shl v3.4S, v1.4S, #4 + shl v1.4S, v1.4S, #2 + shl v7.4S, v2.4S, #2 + add v1.4S, v1.4S, v3.4S + add v2.4S, v2.4S, v7.4S + + add v5.4S, v5.4S, v0.4S + sub v5.4S, v5.4S, v6.4S + + add v1.4S, v1.4S, v4.4S + sub v1.4S, v1.4S, v2.4S + + rshrn v5.4H, v5.4S, #10 + rshrn2 v5.8H, v1.4S, #10 + + sqxtun \r2\().8B, v5.8H +.endm + +function put_h264_qpel16_h_lowpass_neon_packed + mov x4, x30 + mov x12, #16 + mov x3, #8 + bl put_h264_qpel8_h_lowpass_neon + sub x1, x1, x2, lsl #4 + add x1, x1, #8 + mov x12, #16 + mov x30, x4 + b put_h264_qpel8_h_lowpass_neon +endfunc + +.macro h264_qpel_h_lowpass type +function \type\()_h264_qpel16_h_lowpass_neon + mov x13, x30 + mov x12, #16 + bl \type\()_h264_qpel8_h_lowpass_neon + sub x0, x0, x3, lsl #4 + sub x1, x1, x2, lsl #4 + add x0, x0, #8 + add x1, x1, #8 + mov x12, #16 + mov x30, x13 +endfunc + +function \type\()_h264_qpel8_h_lowpass_neon +1: ld1 {v28.8B, v29.8B}, [x1], x2 + ld1 {v16.8B, v17.8B}, [x1], x2 + subs x12, x12, #2 + lowpass_8 v28, v29, v16, v17, v28, v16 + .ifc \type,avg + ld1 {v2.8B}, [x0], x3 + urhadd v28.8B, v28.8B, v2.8B + ld1 {v3.8B}, [x0] + urhadd v16.8B, v16.8B, v3.8B + sub x0, x0, x3 + .endif + st1 {v28.8B}, [x0], x3 + st1 {v16.8B}, [x0], x3 + b.ne 1b + ret +endfunc +.endm + + h264_qpel_h_lowpass put + h264_qpel_h_lowpass avg + +.macro h264_qpel_h_lowpass_l2 type +function \type\()_h264_qpel16_h_lowpass_l2_neon + mov x13, x30 + mov x12, #16 + bl \type\()_h264_qpel8_h_lowpass_l2_neon + sub x0, x0, x2, lsl #4 + sub x1, x1, x2, lsl #4 + sub x3, x3, x2, lsl #4 + add x0, x0, #8 + add x1, x1, #8 + add x3, x3, #8 + mov x12, #16 + mov x30, x13 +endfunc + +function \type\()_h264_qpel8_h_lowpass_l2_neon +1: ld1 {v26.8B, v27.8B}, [x1], x2 + ld1 {v16.8B, v17.8B}, [x1], x2 + ld1 {v28.8B}, [x3], x2 + ld1 {v29.8B}, [x3], x2 + subs x12, x12, #2 + lowpass_8 v26, v27, v16, v17, v26, v27 + urhadd v26.8B, v26.8B, v28.8B + urhadd v27.8B, v27.8B, v29.8B + .ifc \type,avg + ld1 {v2.8B}, [x0], x2 + urhadd v26.8B, v26.8B, v2.8B + ld1 {v3.8B}, [x0] + urhadd v27.8B, v27.8B, v3.8B + sub x0, x0, x2 + .endif + st1 {v26.8B}, [x0], x2 + st1 {v27.8B}, [x0], x2 + b.ne 1b + ret +endfunc +.endm + + h264_qpel_h_lowpass_l2 put + h264_qpel_h_lowpass_l2 avg + +function put_h264_qpel16_v_lowpass_neon_packed + mov x4, x30 + mov x2, #8 + bl put_h264_qpel8_v_lowpass_neon + sub x1, x1, x3, lsl #2 + bl put_h264_qpel8_v_lowpass_neon + sub x1, x1, x3, lsl #4 + sub x1, x1, x3, lsl #2 + add x1, x1, #8 + bl put_h264_qpel8_v_lowpass_neon + sub x1, x1, x3, lsl #2 + mov x30, x4 + b put_h264_qpel8_v_lowpass_neon +endfunc + +.macro h264_qpel_v_lowpass type +function \type\()_h264_qpel16_v_lowpass_neon + mov x4, x30 + bl \type\()_h264_qpel8_v_lowpass_neon + sub x1, x1, x3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_neon + sub x0, x0, x2, lsl #4 + add x0, x0, #8 + sub x1, x1, x3, lsl #4 + sub x1, x1, x3, lsl #2 + add x1, x1, #8 + bl \type\()_h264_qpel8_v_lowpass_neon + sub x1, x1, x3, lsl #2 + mov x30, x4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_neon + ld1 {v16.8B}, [x1], x3 + ld1 {v18.8B}, [x1], x3 + ld1 {v20.8B}, [x1], x3 + ld1 {v22.8B}, [x1], x3 + ld1 {v24.8B}, [x1], x3 + ld1 {v26.8B}, [x1], x3 + ld1 {v28.8B}, [x1], x3 + ld1 {v30.8B}, [x1], x3 + ld1 {v17.8B}, [x1], x3 + ld1 {v19.8B}, [x1], x3 + ld1 {v21.8B}, [x1], x3 + ld1 {v23.8B}, [x1], x3 + ld1 {v25.8B}, [x1] + + transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 + transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 + lowpass_8 v16, v17, v18, v19, 
v16, v17 + lowpass_8 v20, v21, v22, v23, v18, v19 + lowpass_8 v24, v25, v26, v27, v20, v21 + lowpass_8 v28, v29, v30, v31, v22, v23 + transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + + .ifc \type,avg + ld1 {v24.8B}, [x0], x2 + urhadd v16.8B, v16.8B, v24.8B + ld1 {v25.8B}, [x0], x2 + urhadd v17.8B, v17.8B, v25.8B + ld1 {v26.8B}, [x0], x2 + urhadd v18.8B, v18.8B, v26.8B + ld1 {v27.8B}, [x0], x2 + urhadd v19.8B, v19.8B, v27.8B + ld1 {v28.8B}, [x0], x2 + urhadd v20.8B, v20.8B, v28.8B + ld1 {v29.8B}, [x0], x2 + urhadd v21.8B, v21.8B, v29.8B + ld1 {v30.8B}, [x0], x2 + urhadd v22.8B, v22.8B, v30.8B + ld1 {v31.8B}, [x0], x2 + urhadd v23.8B, v23.8B, v31.8B + sub x0, x0, x2, lsl #3 + .endif + + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 + st1 {v18.8B}, [x0], x2 + st1 {v19.8B}, [x0], x2 + st1 {v20.8B}, [x0], x2 + st1 {v21.8B}, [x0], x2 + st1 {v22.8B}, [x0], x2 + st1 {v23.8B}, [x0], x2 + + ret +endfunc +.endm + + h264_qpel_v_lowpass put + h264_qpel_v_lowpass avg + +.macro h264_qpel_v_lowpass_l2 type +function \type\()_h264_qpel16_v_lowpass_l2_neon + mov x4, x30 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub x1, x1, x3, lsl #2 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub x0, x0, x3, lsl #4 + sub x12, x12, x2, lsl #4 + add x0, x0, #8 + add x12, x12, #8 + sub x1, x1, x3, lsl #4 + sub x1, x1, x3, lsl #2 + add x1, x1, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + sub x1, x1, x3, lsl #2 + mov x30, x4 +endfunc + +function \type\()_h264_qpel8_v_lowpass_l2_neon + ld1 {v16.8B}, [x1], x3 + ld1 {v18.8B}, [x1], x3 + ld1 {v20.8B}, [x1], x3 + ld1 {v22.8B}, [x1], x3 + ld1 {v24.8B}, [x1], x3 + ld1 {v26.8B}, [x1], x3 + ld1 {v28.8B}, [x1], x3 + ld1 {v30.8B}, [x1], x3 + ld1 {v17.8B}, [x1], x3 + ld1 {v19.8B}, [x1], x3 + ld1 {v21.8B}, [x1], x3 + ld1 {v23.8B}, [x1], x3 + ld1 {v25.8B}, [x1] + + transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 + transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 + lowpass_8 v16, v17, v18, v19, v16, v17 + lowpass_8 v20, v21, v22, v23, v18, v19 + lowpass_8 v24, v25, v26, v27, v20, v21 + lowpass_8 v28, v29, v30, v31, v22, v23 + transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + + ld1 {v24.8B}, [x12], x2 + ld1 {v25.8B}, [x12], x2 + ld1 {v26.8B}, [x12], x2 + ld1 {v27.8B}, [x12], x2 + ld1 {v28.8B}, [x12], x2 + urhadd v16.8B, v24.8B, v16.8B + urhadd v17.8B, v25.8B, v17.8B + ld1 {v29.8B}, [x12], x2 + urhadd v18.8B, v26.8B, v18.8B + urhadd v19.8B, v27.8B, v19.8B + ld1 {v30.8B}, [x12], x2 + urhadd v20.8B, v28.8B, v20.8B + urhadd v21.8B, v29.8B, v21.8B + ld1 {v31.8B}, [x12], x2 + urhadd v22.8B, v30.8B, v22.8B + urhadd v23.8B, v31.8B, v23.8B + + .ifc \type,avg + ld1 {v24.8B}, [x0], x3 + urhadd v16.8B, v16.8B, v24.8B + ld1 {v25.8B}, [x0], x3 + urhadd v17.8B, v17.8B, v25.8B + ld1 {v26.8B}, [x0], x3 + urhadd v18.8B, v18.8B, v26.8B + ld1 {v27.8B}, [x0], x3 + urhadd v19.8B, v19.8B, v27.8B + ld1 {v28.8B}, [x0], x3 + urhadd v20.8B, v20.8B, v28.8B + ld1 {v29.8B}, [x0], x3 + urhadd v21.8B, v21.8B, v29.8B + ld1 {v30.8B}, [x0], x3 + urhadd v22.8B, v22.8B, v30.8B + ld1 {v31.8B}, [x0], x3 + urhadd v23.8B, v23.8B, v31.8B + sub x0, x0, x3, lsl #3 + .endif + + st1 {v16.8B}, [x0], x3 + st1 {v17.8B}, [x0], x3 + st1 {v18.8B}, [x0], x3 + st1 {v19.8B}, [x0], x3 + st1 {v20.8B}, [x0], x3 + st1 {v21.8B}, [x0], x3 + st1 {v22.8B}, [x0], x3 + st1 {v23.8B}, [x0], x3 + + ret +endfunc +.endm + + h264_qpel_v_lowpass_l2 put + h264_qpel_v_lowpass_l2 avg + +function put_h264_qpel8_hv_lowpass_neon_top + lowpass_const w12 + ld1 {v16.8H}, [x1], x3 + ld1 {v17.8H}, [x1], x3 + 
ld1 {v18.8H}, [x1], x3 + ld1 {v19.8H}, [x1], x3 + ld1 {v20.8H}, [x1], x3 + ld1 {v21.8H}, [x1], x3 + ld1 {v22.8H}, [x1], x3 + ld1 {v23.8H}, [x1], x3 + ld1 {v24.8H}, [x1], x3 + ld1 {v25.8H}, [x1], x3 + ld1 {v26.8H}, [x1], x3 + ld1 {v27.8H}, [x1], x3 + ld1 {v28.8H}, [x1] + lowpass_8H v16, v17 + lowpass_8H v18, v19 + lowpass_8H v20, v21 + lowpass_8H v22, v23 + lowpass_8H v24, v25 + lowpass_8H v26, v27 + lowpass_8H v28, v29 + + transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 + + lowpass_8.16 v16, v24, v16 + lowpass_8.16 v17, v25, v17 + + lowpass_8.16 v18, v26, v18 + lowpass_8.16 v19, v27, v19 + + lowpass_8.16 v20, v28, v20 + lowpass_8.16 v21, v29, v21 + + lowpass_8.16 v22, v30, v22 + lowpass_8.16 v23, v31, v23 + + transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 + + ret +endfunc + +.macro h264_qpel8_hv_lowpass type +function \type\()_h264_qpel8_hv_lowpass_neon + mov x10, x30 + bl put_h264_qpel8_hv_lowpass_neon_top + .ifc \type,avg + ld1 {v0.8B}, [x0], x2 + urhadd v16.8B, v16.8B, v0.8B + ld1 {v1.8B}, [x0], x2 + urhadd v17.8B, v17.8B, v1.8B + ld1 {v2.8B}, [x0], x2 + urhadd v18.8B, v18.8B, v2.8B + ld1 {v3.8B}, [x0], x2 + urhadd v19.8B, v19.8B, v3.8B + ld1 {v4.8B}, [x0], x2 + urhadd v20.8B, v20.8B, v4.8B + ld1 {v5.8B}, [x0], x2 + urhadd v21.8B, v21.8B, v5.8B + ld1 {v6.8B}, [x0], x2 + urhadd v22.8B, v22.8B, v6.8B + ld1 {v7.8B}, [x0], x2 + urhadd v23.8B, v23.8B, v7.8B + sub x0, x0, x2, lsl #3 + .endif + + st1 {v16.8B}, [x0], x2 + st1 {v17.8B}, [x0], x2 + st1 {v18.8B}, [x0], x2 + st1 {v19.8B}, [x0], x2 + st1 {v20.8B}, [x0], x2 + st1 {v21.8B}, [x0], x2 + st1 {v22.8B}, [x0], x2 + st1 {v23.8B}, [x0], x2 + + ret x10 +endfunc +.endm + + h264_qpel8_hv_lowpass put + h264_qpel8_hv_lowpass avg + +.macro h264_qpel8_hv_lowpass_l2 type +function \type\()_h264_qpel8_hv_lowpass_l2_neon + mov x10, x30 + bl put_h264_qpel8_hv_lowpass_neon_top + + ld1 {v0.8B, v1.8B}, [x2], #16 + ld1 {v2.8B, v3.8B}, [x2], #16 + urhadd v0.8B, v0.8B, v16.8B + urhadd v1.8B, v1.8B, v17.8B + ld1 {v4.8B, v5.8B}, [x2], #16 + urhadd v2.8B, v2.8B, v18.8B + urhadd v3.8B, v3.8B, v19.8B + ld1 {v6.8B, v7.8B}, [x2], #16 + urhadd v4.8B, v4.8B, v20.8B + urhadd v5.8B, v5.8B, v21.8B + urhadd v6.8B, v6.8B, v22.8B + urhadd v7.8B, v7.8B, v23.8B + .ifc \type,avg + ld1 {v16.8B}, [x0], x3 + urhadd v0.8B, v0.8B, v16.8B + ld1 {v17.8B}, [x0], x3 + urhadd v1.8B, v1.8B, v17.8B + ld1 {v18.8B}, [x0], x3 + urhadd v2.8B, v2.8B, v18.8B + ld1 {v19.8B}, [x0], x3 + urhadd v3.8B, v3.8B, v19.8B + ld1 {v20.8B}, [x0], x3 + urhadd v4.8B, v4.8B, v20.8B + ld1 {v21.8B}, [x0], x3 + urhadd v5.8B, v5.8B, v21.8B + ld1 {v22.8B}, [x0], x3 + urhadd v6.8B, v6.8B, v22.8B + ld1 {v23.8B}, [x0], x3 + urhadd v7.8B, v7.8B, v23.8B + sub x0, x0, x3, lsl #3 + .endif + st1 {v0.8B}, [x0], x3 + st1 {v1.8B}, [x0], x3 + st1 {v2.8B}, [x0], x3 + st1 {v3.8B}, [x0], x3 + st1 {v4.8B}, [x0], x3 + st1 {v5.8B}, [x0], x3 + st1 {v6.8B}, [x0], x3 + st1 {v7.8B}, [x0], x3 + + ret x10 +endfunc +.endm + + h264_qpel8_hv_lowpass_l2 put + h264_qpel8_hv_lowpass_l2 avg + +.macro h264_qpel16_hv type +function \type\()_h264_qpel16_hv_lowpass_neon + mov x13, x30 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub x1, x1, x3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub x1, x1, x3, lsl #4 + sub x1, x1, x3, lsl #2 + add x1, x1, #8 + sub x0, x0, x2, lsl #4 + add x0, x0, #8 + bl \type\()_h264_qpel8_hv_lowpass_neon + sub x1, x1, x3, lsl #2 + mov x30, x13 + b \type\()_h264_qpel8_hv_lowpass_neon +endfunc + +function 
\type\()_h264_qpel16_hv_lowpass_l2_neon + mov x13, x30 + sub x2, x4, #256 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub x1, x1, x3, lsl #2 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub x1, x1, x3, lsl #4 + sub x1, x1, x3, lsl #2 + add x1, x1, #8 + sub x0, x0, x3, lsl #4 + add x0, x0, #8 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + sub x1, x1, x3, lsl #2 + mov x30, x13 + b \type\()_h264_qpel8_hv_lowpass_l2_neon +endfunc +.endm + + h264_qpel16_hv put + h264_qpel16_hv avg + +.macro h264_qpel8 type +function ff_\type\()_h264_qpel8_mc10_neon, export=1 + lowpass_const w3 + mov x3, x1 + sub x1, x1, #2 + mov x12, #8 + b \type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc20_neon, export=1 + lowpass_const w3 + sub x1, x1, #2 + mov x3, x2 + mov x12, #8 + b \type\()_h264_qpel8_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel8_mc30_neon, export=1 + lowpass_const w3 + add x3, x1, #1 + sub x1, x1, #2 + mov x12, #8 + b \type\()_h264_qpel8_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel8_mc01_neon, export=1 + mov x14, x30 + mov x12, x1 +\type\()_h264_qpel8_mc01: + lowpass_const w3 + mov x3, x2 + sub x1, x1, x2, lsl #1 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc11_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel8_mc11: + lowpass_const w3 + mov x11, sp + sub sp, sp, #64 + mov x0, sp + sub x1, x1, #2 + mov x3, #8 + mov x12, #8 + bl put_h264_qpel8_h_lowpass_neon + mov x0, x8 + mov x3, x2 + mov x12, sp + sub x1, x9, x2, lsl #1 + mov x2, #8 + bl \type\()_h264_qpel8_v_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc21_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel8_mc21: + lowpass_const w3 + mov x11, sp + sub sp, sp, #(8*8+16*12) + sub x1, x1, #2 + mov x3, #8 + mov x0, sp + mov x12, #8 + bl put_h264_qpel8_h_lowpass_neon + mov x4, x0 + mov x0, x8 + sub x1, x9, x2, lsl #1 + sub x1, x1, #2 + mov x3, x2 + sub x2, x4, #64 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc31_neon, export=1 + add x1, x1, #1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + sub x1, x1, #1 + b \type\()_h264_qpel8_mc11 +endfunc + +function ff_\type\()_h264_qpel8_mc02_neon, export=1 + mov x14, x30 + lowpass_const w3 + sub x1, x1, x2, lsl #1 + mov x3, x2 + bl \type\()_h264_qpel8_v_lowpass_neon + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc12_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel8_mc12: + lowpass_const w3 + mov x11, sp + sub sp, sp, #(8*8+16*12) + sub x1, x1, x2, lsl #1 + mov x3, x2 + mov x2, #8 + mov x0, sp + bl put_h264_qpel8_v_lowpass_neon + mov x4, x0 + mov x0, x8 + sub x1, x9, x3, lsl #1 + sub x1, x1, #2 + sub x2, x4, #64 + bl \type\()_h264_qpel8_hv_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc22_neon, export=1 + mov x14, x30 + mov x11, sp + sub x1, x1, x2, lsl #1 + sub x1, x1, #2 + mov x3, x2 + bl \type\()_h264_qpel8_hv_lowpass_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel8_mc32_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, #1 + b \type\()_h264_qpel8_mc12 +endfunc + +function ff_\type\()_h264_qpel8_mc03_neon, export=1 + mov x14, x30 + add x12, x1, x2 + b \type\()_h264_qpel8_mc01 +endfunc + +function ff_\type\()_h264_qpel8_mc13_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + b \type\()_h264_qpel8_mc11 
+endfunc + +function ff_\type\()_h264_qpel8_mc23_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + b \type\()_h264_qpel8_mc21 +endfunc + +function ff_\type\()_h264_qpel8_mc33_neon, export=1 + add x1, x1, #1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + sub x1, x1, #1 + b \type\()_h264_qpel8_mc11 +endfunc +.endm + + h264_qpel8 put + h264_qpel8 avg + +.macro h264_qpel16 type +function ff_\type\()_h264_qpel16_mc10_neon, export=1 + lowpass_const w3 + mov x3, x1 + sub x1, x1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc20_neon, export=1 + lowpass_const w3 + sub x1, x1, #2 + mov x3, x2 + b \type\()_h264_qpel16_h_lowpass_neon +endfunc + +function ff_\type\()_h264_qpel16_mc30_neon, export=1 + lowpass_const w3 + add x3, x1, #1 + sub x1, x1, #2 + b \type\()_h264_qpel16_h_lowpass_l2_neon +endfunc + +function ff_\type\()_h264_qpel16_mc01_neon, export=1 + mov x14, x30 + mov x12, x1 +\type\()_h264_qpel16_mc01: + lowpass_const w3 + mov x3, x2 + sub x1, x1, x2, lsl #1 + bl \type\()_h264_qpel16_v_lowpass_l2_neon + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc11_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel16_mc11: + lowpass_const w3 + mov x11, sp + sub sp, sp, #256 + mov x0, sp + sub x1, x1, #2 + mov x3, #16 + bl put_h264_qpel16_h_lowpass_neon + mov x0, x8 + mov x3, x2 + mov x12, sp + sub x1, x9, x2, lsl #1 + mov x2, #16 + bl \type\()_h264_qpel16_v_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc21_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel16_mc21: + lowpass_const w3 + mov x11, sp + sub sp, sp, #(16*16+16*12) + sub x1, x1, #2 + mov x0, sp + bl put_h264_qpel16_h_lowpass_neon_packed + mov x4, x0 + mov x0, x8 + sub x1, x9, x2, lsl #1 + sub x1, x1, #2 + mov x3, x2 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc31_neon, export=1 + add x1, x1, #1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + sub x1, x1, #1 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc02_neon, export=1 + mov x14, x30 + lowpass_const w3 + sub x1, x1, x2, lsl #1 + mov x3, x2 + bl \type\()_h264_qpel16_v_lowpass_neon + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc12_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 +\type\()_h264_qpel16_mc12: + lowpass_const w3 + mov x11, sp + sub sp, sp, #(16*16+16*12) + sub x1, x1, x2, lsl #1 + mov x0, sp + mov x3, x2 + bl put_h264_qpel16_v_lowpass_neon_packed + mov x4, x0 + mov x0, x8 + sub x1, x9, x3, lsl #1 + sub x1, x1, #2 + mov x2, x3 + bl \type\()_h264_qpel16_hv_lowpass_l2_neon + mov sp, x11 + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc22_neon, export=1 + mov x14, x30 + lowpass_const w3 + mov x11, sp + sub x1, x1, x2, lsl #1 + sub x1, x1, #2 + mov x3, x2 + bl \type\()_h264_qpel16_hv_lowpass_neon + mov sp, x11 // restore stack + ret x14 +endfunc + +function ff_\type\()_h264_qpel16_mc32_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, #1 + b \type\()_h264_qpel16_mc12 +endfunc + +function ff_\type\()_h264_qpel16_mc03_neon, export=1 + mov x14, x30 + add x12, x1, x2 + b \type\()_h264_qpel16_mc01 +endfunc + +function ff_\type\()_h264_qpel16_mc13_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + b \type\()_h264_qpel16_mc11 +endfunc + +function ff_\type\()_h264_qpel16_mc23_neon, export=1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + b 
\type\()_h264_qpel16_mc21 +endfunc + +function ff_\type\()_h264_qpel16_mc33_neon, export=1 + add x1, x1, #1 + mov x14, x30 + mov x8, x0 + mov x9, x1 + add x1, x1, x2 + sub x1, x1, #1 + b \type\()_h264_qpel16_mc11 +endfunc +.endm + + h264_qpel16 put + h264_qpel16 avg diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S index 3af9bcdabd..c4310d7bc0 100644 --- a/libavcodec/aarch64/neon.S +++ b/libavcodec/aarch64/neon.S @@ -16,6 +16,70 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8B, \r0\().8B, \r1\().8B + trn2 \r9\().8B, \r0\().8B, \r1\().8B + trn1 \r1\().8B, \r2\().8B, \r3\().8B + trn2 \r3\().8B, \r2\().8B, \r3\().8B + trn1 \r0\().8B, \r4\().8B, \r5\().8B + trn2 \r5\().8B, \r4\().8B, \r5\().8B + trn1 \r2\().8B, \r6\().8B, \r7\().8B + trn2 \r7\().8B, \r6\().8B, \r7\().8B + + trn1 \r4\().4H, \r0\().4H, \r2\().4H + trn2 \r2\().4H, \r0\().4H, \r2\().4H + trn1 \r6\().4H, \r5\().4H, \r7\().4H + trn2 \r7\().4H, \r5\().4H, \r7\().4H + trn1 \r5\().4H, \r9\().4H, \r3\().4H + trn2 \r9\().4H, \r9\().4H, \r3\().4H + trn1 \r3\().4H, \r8\().4H, \r1\().4H + trn2 \r8\().4H, \r8\().4H, \r1\().4H + + trn1 \r0\().2S, \r3\().2S, \r4\().2S + trn2 \r4\().2S, \r3\().2S, \r4\().2S + + trn1 \r1\().2S, \r5\().2S, \r6\().2S + trn2 \r5\().2S, \r5\().2S, \r6\().2S + + trn2 \r6\().2S, \r8\().2S, \r2\().2S + trn1 \r2\().2S, \r8\().2S, \r2\().2S + + trn1 \r3\().2S, \r9\().2S, \r7\().2S + trn2 \r7\().2S, \r9\().2S, \r7\().2S +.endm + +.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 + trn1 \t0\().16B, \r0\().16B, \r1\().16B + trn2 \t1\().16B, \r0\().16B, \r1\().16B + trn1 \r1\().16B, \r2\().16B, \r3\().16B + trn2 \r3\().16B, \r2\().16B, \r3\().16B + trn1 \r0\().16B, \r4\().16B, \r5\().16B + trn2 \r5\().16B, \r4\().16B, \r5\().16B + trn1 \r2\().16B, \r6\().16B, \r7\().16B + trn2 \r7\().16B, \r6\().16B, \r7\().16B + + trn1 \r4\().8H, \r0\().8H, \r2\().8H + trn2 \r2\().8H, \r0\().8H, \r2\().8H + trn1 \r6\().8H, \r5\().8H, \r7\().8H + trn2 \r7\().8H, \r5\().8H, \r7\().8H + trn1 \r5\().8H, \t1\().8H, \r3\().8H + trn2 \t1\().8H, \t1\().8H, \r3\().8H + trn1 \r3\().8H, \t0\().8H, \r1\().8H + trn2 \t0\().8H, \t0\().8H, \r1\().8H + + trn1 \r0\().4S, \r3\().4S, \r4\().4S + trn2 \r4\().4S, \r3\().4S, \r4\().4S + + trn1 \r1\().4S, \r5\().4S, \r6\().4S + trn2 \r5\().4S, \r5\().4S, \r6\().4S + + trn2 \r6\().4S, \t0\().4S, \r2\().4S + trn1 \r2\().4S, \t0\().4S, \r2\().4S + + trn1 \r3\().4S, \t1\().4S, \r7\().4S + trn2 \r7\().4S, \t1\().4S, \r7\().4S +.endm + .macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 trn1 \r4\().4H, \r0\().4H, \r1\().4H trn2 \r5\().4H, \r0\().4H, \r1\().4H
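
A note on the naming in the init file: the mc<x><y> suffix encodes the quarter-sample position, with x the horizontal and y the vertical quarter-pel phase (each 0-3), and ff_h264qpel_init_aarch64() installs the function for position (x, y) at index x + 4*y of the put/avg_h264_qpel_pixels_tab arrays (row 0 serves 16x16 blocks, row 1 serves 8x8 blocks). An illustrative sketch of that mapping, with an invented helper name that is not part of the patch:

    /* Hypothetical helper: map the quarter-pel remainder of a motion vector
     * to the pixels_tab slot filled by ff_h264qpel_init_aarch64(). */
    static inline int qpel_tab_index(int mvx, int mvy)
    {
        int x = mvx & 3;   /* horizontal quarter-pel phase: the <x> in mc<x><y> */
        int y = mvy & 3;   /* vertical quarter-pel phase:   the <y> in mc<x><y> */
        return x + 4 * y;  /* mc10 -> 1, mc01 -> 4, mc22 -> 10, mc33 -> 15 */
    }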
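
The lowpass_const macro packs the filter weights 20 and 5 into v6.H[1] and v6.H[0]; together with the closing sqrshrun #5, the lowpass_8 macro implements H.264's standard six-tap half-sample filter with taps (1, -5, 20, 20, -5, 1). A scalar C sketch of what it computes per output byte (function name illustrative):

    #include <stdint.h>

    /* One output byte of the H.264 half-pel lowpass filter; src points two
     * samples before the output position. Mirrors the NEON uaddl/mla/mls
     * sequence followed by sqrshrun #5 (round, shift, saturate to u8). */
    static uint8_t h264_lowpass_sample(const uint8_t *src)
    {
        int sum = (src[0] + src[5])        /* outer taps,  weight  1 */
                + 20 * (src[2] + src[3])   /* center taps, weight 20 */
                -  5 * (src[1] + src[4]);  /* inner taps,  weight  5 */
        sum = (sum + 16) >> 5;
        return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
    }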
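
For the center (hv) positions, put_h264_qpel8_hv_lowpass_neon_top applies the same filter twice: lowpass_8H produces 16-bit horizontal intermediates without narrowing, and lowpass_8.16 then filters those vertically (after a transpose), rounding only once with rshrn #10, since the two passes together scale by 32*32 = 1024. A sketch of the second pass under those assumptions (helper name invented):

    /* Second filter pass over six 16-bit intermediates m[0..5] from the
     * first pass; one rounding shift by 10 replaces two shifts by 5, and
     * sqxtun supplies the final saturation to u8. */
    static uint8_t h264_lowpass_sample_2nd(const int16_t *m)
    {
        int sum = (m[0] + m[5]) + 20 * (m[2] + m[3]) - 5 * (m[1] + m[4]);
        sum = (sum + 512) >> 10;
        return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
    }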
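
Finally, the _l2 helpers average the filtered result with a second prediction, and the avg variants additionally average with the bytes already at dst; both use urhadd, NEON's unsigned rounding halving add. Per byte that is simply (sketch, not from the patch):

    /* Scalar equivalent of urhadd on one byte: average, rounding upward. */
    static inline uint8_t rnd_avg8(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }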