diff options
Diffstat (limited to 'libavresample/aarch64/resample_neon.S')
-rw-r--r-- | libavresample/aarch64/resample_neon.S | 233 |
1 file changed, 233 insertions, 0 deletions
diff --git a/libavresample/aarch64/resample_neon.S b/libavresample/aarch64/resample_neon.S
new file mode 100644
index 0000000000..d3c2cbf561
--- /dev/null
+++ b/libavresample/aarch64/resample_neon.S
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "asm-offsets.h"            // FILTER_BANK/FILTER_LENGTH/PHASE_SHIFT struct offsets
+
+// resample_one: emits ff_resample_one_<fmt>_neon, which computes ONE output
+// sample as the dot product of a source window with one filter phase.
+// es = log2(element size in bytes): dbl = 3, flt/s32 = 2 (default), s16 = 1.
+// Each instantiation consumes the per-format LOAD*/M_MUL*/M_MLA*/STORE_ONE
+// macros defined immediately before it and purges them again at the end.
+// Register use — NOTE(review): presumably matches the C resample_one()
+// callback (ctx, dst, dst_index, src, index); confirm against resample.c:
+//   x0 = ResampleContext*, x1 = dst, w2 = dst_index, x3 = src, x4 = index
+.macro resample_one fmt, es=2
+.ifnc \fmt, dbl
+    // Every format except dbl: the second-accumulator helpers default to
+    // no-ops here (dbl defines real forwarding ones below; s32 instead folds
+    // the second accumulator into M_MUL/M_MLA via the trailing vararg).
+    .macro M_MUL2 x:vararg
+    .endm
+    .macro M_MLA2 x:vararg
+    .endm
+.endif
+function ff_resample_one_\fmt\()_neon, export=1
+        sxtw    x2, w2                      // sign-extend dst_index for the 64-bit store offset
+        ldr     x9, [x0, #FILTER_BANK]
+        ldr     w6, [x0, #FILTER_LENGTH]
+        ldp     w7, w8, [x0, #PHASE_SHIFT]  // and phase_mask
+        lsr     x10, x4, x7                 // sample_index
+        and     x4, x4, x8                  // x4 = index & phase_mask (filter phase)
+        lsl     x11, x6, #\es               // filter_length * elem_size
+        add     x3, x3, x10, lsl #\es       // src[sample_index]
+        madd    x9, x11, x4, x9             // filter
+        cmp     w6, #16
+        b.lt    5f                          // short filter: tail path (accumulators not yet initialised)
+8:                                          // remaining filter_length at least 16
+        subs    w6, w6, #16
+        LOAD8   v4, v5, v6, v7, x3
+        LOAD8   v16, v17, v18, v19, x9
+        M_MUL   v0, v4, v16, v1             // first products initialise the accumulators
+        M_MUL2  v1, v6, v18
+7:
+        LOAD8   v20, v21, v22, v23, x3
+        M_MLA   v0, v5, v17, v1
+        M_MLA2  v1, v7, v19
+        LOAD8   v24, v25, v26, v27, x9
+        M_MLA   v0, v20, v24, v1
+        M_MLA2  v1, v22, v26
+        b.eq    6f                          // these 16 taps were the last
+        cmp     w6, #16                     // compare early; the NEON MLAs below leave the flags intact
+        M_MLA   v0, v21, v25, v1
+        M_MLA2  v1, v23, v27
+        b.lt    4f                          // fewer than 16 taps remain: fall to the 4/2/1 tails
+        subs    w6, w6, #16
+        LOAD8   v4, v5, v6, v7, x3
+        LOAD8   v16, v17, v18, v19, x9
+        M_MLA   v0, v4, v16, v1
+        M_MLA2  v1, v6, v18
+        b       7b
+6:                                          // filter exhausted: finish pending products and store
+        M_MLA   v0, v21, v25, v1
+        M_MLA2  v1, v23, v27
+        STORE_ONE 0, x1, x2, v1             // dst[dst_index] = horizontal sum
+        ret
+5:
+        movi    v0.16b, #0                  // short path never ran M_MUL: zero both accumulators
+        movi    v1.16b, #0
+4:                                          // remaining filter_length 1-15
+        cmp     w6, #4
+        b.lt    2f
+        subs    w6, w6, #4
+        LOAD4   v4, v5, x3
+        LOAD4   v6, v7, x9
+        M_MLA   v0, v4, v6, v1
+        M_MLA2  v1, v5, v7
+        b.eq    0f
+        b       4b
+2:                                          // remaining filter_length 1-3
+        cmp     w6, #2
+        b.lt    1f
+        LOAD2   2, x3
+        LOAD2   3, x9
+        subs    w6, w6, #2
+        M_MLA   v0, v2, v3
+        b.eq    0f
+1:                                          // remaining filter_length 1
+        LOAD1   6, x3
+        LOAD1   7, x9
+        M_MLA   v0, v6, v7
+0:
+        STORE_ONE 0, x1, x2, v1
+        ret
+endfunc
+
+// drop this instantiation's format-specific helpers so the next format
+// can define its own set
+.purgem LOAD1
+.purgem LOAD2
+.purgem LOAD4
+.purgem LOAD8
+.purgem M_MLA
+.purgem M_MLA2
+.purgem M_MUL
+.purgem M_MUL2
+.purgem STORE_ONE
+.endm
+
+
+// double precision: .2d lanes, two independent accumulators (v0/v1)
+.macro LOAD1 d1, addr
+        ldr     d\d1, [\addr], #8
+.endm
+.macro LOAD2 d1, addr
+        ld1     {v\d1\().2d}, [\addr], #16
+.endm
+.macro LOAD4 d1, d2, addr
+        ld1     {\d1\().2d,\d2\().2d}, [\addr], #32
+.endm
+.macro LOAD8 d1, d2, d3, d4, addr
+        ld1     {\d1\().2d,\d2\().2d,\d3\().2d,\d4\().2d}, [\addr], #64
+.endm
+.macro M_MLA d, r0, r1, d2:vararg
+        fmla    \d\().2d, \r0\().2d, \r1\().2d
+.endm
+.macro M_MLA2 second:vararg                 // second accumulator: forward to M_MLA
+        M_MLA   \second
+.endm
+.macro M_MUL d, r0, r1, d2:vararg
+        fmul    \d\().2d, \r0\().2d, \r1\().2d
+.endm
+.macro M_MUL2 second:vararg
+        M_MUL   \second
+.endm
+.macro STORE_ONE rn, addr, idx, d2          // combine accumulators, pairwise-reduce, dst[idx] = sum
+        fadd    v\rn\().2d, v\rn\().2d, \d2\().2d
+        faddp   d\rn\(), v\rn\().2d
+        str     d\rn\(), [\addr, \idx, lsl #3]
+.endm
+
+resample_one dbl, 3
+
+
+// single precision: .4s lanes; a single accumulator (v0) suffices, so
+// LOAD8 fills only two registers and M_MLA2/M_MUL2 remain no-ops
+.macro LOAD1 d1, addr
+        ldr     s\d1, [\addr], #4
+.endm
+.macro LOAD2 d1, addr
+        ld1     {v\d1\().2s}, [\addr], #8
+.endm
+.macro LOAD4 d1, d2, addr
+        ld1     {\d1\().4s}, [\addr], #16
+.endm
+.macro LOAD8 d1, d2, d3, d4, addr
+        ld1     {\d1\().4s,\d2\().4s}, [\addr], #32
+.endm
+.macro M_MLA d, r0, r1, d2:vararg
+        fmla    \d\().4s, \r0\().4s, \r1\().4s
+.endm
+.macro M_MUL d, r0, r1, d2:vararg
+        fmul    \d\().4s, \r0\().4s, \r1\().4s
+.endm
+.macro STORE_ONE rn, addr, idx, d2          // \d2 unused: flt keeps everything in one accumulator
+        faddp   v\rn\().4s, v\rn\().4s, v\rn\().4s
+        faddp   s\rn\(), v\rn\().2s
+        str     s\rn\(), [\addr, \idx, lsl #2]
+.endm
+
+resample_one flt
+
+
+// signed 16-bit: widening multiplies into one .4s accumulator
+.macro LOAD1 d1, addr
+        ldr     h\d1, [\addr], #2
+.endm
+.macro LOAD2 d1, addr
+        ldr     s\d1, [\addr], #4           // two s16 elements; scalar load zeroes the upper lanes
+.endm
+.macro LOAD4 d1, d2, addr
+        ld1     {\d1\().4h}, [\addr], #8
+.endm
+.macro LOAD8 d1, d2, d3, d4, addr
+        ld1     {\d1\().4h,\d2\().4h}, [\addr], #16
+.endm
+.macro M_MLA d, r0, r1, d2:vararg
+        smlal   \d\().4s, \r0\().4h, \r1\().4h
+.endm
+.macro M_MUL d, r0, r1, d2:vararg
+        smull   \d\().4s, \r0\().4h, \r1\().4h
+.endm
+.macro STORE_ONE rn, addr, idx, d2          // reduce, then round/saturate back down to s16
+        addp    v\rn\().4s, v\rn\().4s, v\rn\().4s
+        addp    v\rn\().4s, v\rn\().4s, v\rn\().4s
+        sqrshrn v\rn\().4h, v\rn\().4s, #15 // NOTE(review): presumably Q15 coefficients — confirm in resample.c
+        str     h\rn\(), [\addr, \idx, lsl #1]
+.endm
+
+resample_one s16, 1
+
+
+// signed 32-bit: widening multiplies into two .2d accumulators; the second
+// accumulator is driven from inside M_MLA/M_MUL via the vararg operand
+.macro LOAD1 d1, addr
+        ldr     s\d1, [\addr], #4
+.endm
+.macro LOAD2 d1, addr
+        ld1     {v\d1\().2s}, [\addr], #8
+.endm
+.macro LOAD4 d1, d2, addr
+        ld1     {\d1\().4s}, [\addr], #16
+.endm
+.macro LOAD8 d1, d2, d3, d4, addr
+        ld1     {\d1\().4s,\d2\().4s}, [\addr], #32
+.endm
+.macro M_MLA d1, r0, r1, d2:vararg
+        smlal   \d1\().2d, \r0\().2s, \r1\().2s
+.ifnb \d2                                   // second accumulator supplied: also multiply the high lanes
+        smlal2  \d2\().2d, \r0\().4s, \r1\().4s
+.endif
+.endm
+.macro M_MUL d1, r0, r1, d2:vararg
+        smull   \d1\().2d, \r0\().2s, \r1\().2s
+.ifnb \d2
+        smull2  \d2\().2d, \r0\().4s, \r1\().4s
+.endif
+.endm
+.macro STORE_ONE rn, addr, idx, d2          // combine accumulators, reduce, round/saturate down to s32
+        add     v\rn\().2d, v\rn\().2d, \d2\().2d
+        addp    d\rn\(), v\rn\().2d
+        sqrshrn v\rn\().2s, v\rn\().2d, #30 // NOTE(review): presumably Q30 coefficients — confirm in resample.c
+        str     s\rn\(), [\addr, \idx, lsl #2]
+.endm
+
+resample_one s32