/*
 * Copyright (c) 2010 Mans Rullgard
 * Copyright (c) 2015 Janne Grunau
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm-offsets.h"

#include "libavutil/aarch64/asm.S"

/*
 * inner_loop: one multiply-accumulate step over four float lanes.
 *
 * In:    x4-x7    four read pointers (window, per the setup in the caller)
 *        x8-x11   four read pointers (synth_buf, per the setup in the caller)
 *        x15      post-increment in bytes applied to all eight pointers
 *        v4-v7    running accumulators a, b, c, d
 * Out:   v4 -= [x4] * lane_reversed([x8])
 *        v5 += [x5] * [x9]
 *        v6 += [x6] * [x10]
 *        v7 += [x7] * lane_reversed([x11])
 *        x4-x11 each advanced by x15
 * Clobb: v24-v31
 *
 * The macro contains no flag-setting instruction, so the caller may place
 * it between a compare and the conditional branch that consumes the result.
 * Loads are interleaved with the arithmetic; keep the instruction order.
 */
.macro inner_loop
        ld1             {v29.4s}, [x9], x15
        ld1             {v28.4s}, [x8], x15
        ld1             {v30.4s}, [x10], x15
        ld1             {v31.4s}, [x11], x15
        rev64           v28.4s, v28.4s          // swap lanes inside each 64-bit half
        ld1             {v24.4s}, [x4], x15
        ld1             {v25.4s}, [x5], x15
        rev64           v31.4s, v31.4s          // swap lanes inside each 64-bit half
        ld1             {v26.4s}, [x6], x15
        fmla            v5.4s, v25.4s, v29.4s   // b += [x5] * [x9]
        ld1             {v27.4s}, [x7], x15
        ext             v28.16b, v28.16b, v28.16b, #8   // swap halves: v28 now fully lane-reversed
        ext             v31.16b, v31.16b, v31.16b, #8   // swap halves: v31 now fully lane-reversed
        fmla            v6.4s, v26.4s, v30.4s   // c += [x6] * [x10]
        fmls            v4.4s, v24.4s, v28.4s   // a -= [x4] * reversed [x8]
        fmla            v7.4s, v27.4s, v31.4s   // d += [x7] * reversed [x11]
.endm

/*
 * ff_synth_filter_float_neon
 *
 * Register roles on entry, as used by the code below:
 *   x0  context pointer; the function pointer at offset IMDCT_HALF is
 *       called with this same x0
 *   x1  synth_buf base (float *)
 *   x2  pointer to the 32-bit synth_buf offset (read, then updated)
 *   x3  synth_buf_2 (float *, read and rewritten)
 *   x4  window (float *, read only)
 *   x5  out (float *, written)
 *   x6  in, forwarded as third argument of the imdct_half call
 *   s0  scale applied to the a and b accumulators before they are stored
 * NOTE(review): this presumably mirrors the C prototype of the synth
 * filter callback; confirm names and types against the C caller.
 *
 * Steps:
 *   1. Call imdct_half(x0, synth_buf + offset, in).
 *   2. Store (offset - 32) & 511 back through x2.
 *   3. Run four outer iterations; each one executes inner_loop eight times
 *      in total (512 / 64, assuming the offset is within 0..511) and then
 *      stores 4 floats each to out, out + 16, synth_buf_2 and
 *      synth_buf_2 + 16.
 *
 * Stack: 64-byte frame; sp stays 16-byte aligned.
 *   [sp, #0]  x3, x4    [sp, #16] x5, synth_buf + offset
 *   [sp, #32] offset & ~63, x30    [sp, #48] s0
 * Clobb: x0-x15, v0, v4-v7, v24-v31, flags, plus whatever the callee
 *        clobbers. No callee-saved register is modified here.
 */
function ff_synth_filter_float_neon, export=1
        ldr             w7, [x2]                // *synth_buf_offset
        ldr             x9, [x0, #IMDCT_HALF]   // imdct_half function pointer
        sxtw            x7, w7                  // sign-extend offset for pointer arithmetic
        stp             x3, x4, [sp, #-64]!     // allocate frame, save synth_buf_2 and window
        add             x1, x1, x7, lsl #2      // synth_buf + offset, also the callee output argument
        sub             w8, w7, #32
        stp             x5, x1, [sp, #16]       // save out and synth_buf + offset
        bic             x7, x7, #63             // offset rounded down to a multiple of 64
        and             w8, w8, #511            // (offset - 32) & 511
        stp             x7, x30, [sp, #32]      // save rounded offset and return address
        str             w8, [x2]                // *synth_buf_offset = (offset - 32) & 511
        str             s0, [sp, #48]           // scale lives in a caller-saved register, keep it across the call

        mov             x2, x6                  // in

        blr             x9                      // imdct_half(x0, synth_buf + offset, in)

        ldp             x2, x4, [sp]            // synth_buf_2, window
        ldp             x13, x9, [sp, #16]      // out, synth_buf
        ldp             x0, x30, [sp, #32]      // offset & ~63, return address
        ldr             s0, [sp, #48]           // scale

        add             x3, x2, #16*4           // synth_buf_2 + 16
        add             x14, x13, #16*4         // out + 16
        add             x8, x9, #12*4           // second synth_buf cursor, 12 floats ahead of x9
        mov             x15, #64*4              // inner_loop stride: 64 floats
        mov             x1, #4                  // outer loop count
1:
        // x10, x11 and x5-x7 are rebuilt on every outer iteration;
        // x8, x9 and x4 are carried over from the previous one.
        add             x10, x9, #16*4          // synth_buf
        add             x11, x8, #16*4
        add             x5, x4, #16*4           // window
        add             x6, x4, #32*4
        add             x7, x4, #48*4

        ld1             {v4.4s}, [x2]           // a
        ld1             {v5.4s}, [x3]           // b
        movi            v6.4s, #0               // c
        movi            v7.4s, #0               // d

        mov             x12, #512               // countdown, 64 per inner_loop
2:
        // First pass: run while the countdown is above the rounded offset.
        sub             x12, x12, #64
        cmp             x12, x0                 // flags stay live across inner_loop
        inner_loop
        b.gt            2b

        // Wind the synth_buf cursors back by 512 floats. x8 and x9 are
        // carried into the next outer iteration, so they are always
        // adjusted; x10 and x11 only matter if the second pass runs.
        sub             x8, x8, #512*4
        sub             x9, x9, #512*4
        cbz             x12, 4f                 // countdown exhausted, no second pass
        sub             x10, x10, #512*4
        sub             x11, x11, #512*4
3:
        // Second pass: finish the countdown down to zero.
        subs            x12, x12, #64           // flags stay live across inner_loop
        inner_loop
        b.gt            3b
4:
        subs            x1, x1, #1              // flags consumed by b.le below; fmul and st1 leave them alone
        fmul            v4.4s, v4.4s, v0.s[0]   // a *= scale
        fmul            v5.4s, v5.4s, v0.s[0]   // b *= scale
        st1             {v6.4s}, [x2], #16      // synth_buf_2[0..3] = c, advance
        st1             {v7.4s}, [x3], #16      // synth_buf_2[16..19] = d, advance
        st1             {v4.4s}, [x13], #16     // out[0..3] = a, advance
        st1             {v5.4s}, [x14], #16     // out[16..19] = b, advance
        b.le            10f

        // x4 was advanced by 512 floats over both passes; net step is +4.
        sub             x4, x4, #508*4          // window
        add             x9, x9, #4*4            // synth_buf
        sub             x8, x8, #4*4            // synth_buf
        b               1b

10:
        add             sp, sp, #64             // release frame; x30 was reloaded after the call
        ret
endfunc