From 8f9fe6ae3461ce270bce6b7083fda5ec314cdad4 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Sat, 19 Apr 2014 18:17:23 +0200 Subject: aarch64: NEON fixed/floating point MPADSP apply_window 30%/25% (fixed/float) faster mp3 decoding on Apple's A7. The floating point decoder is approximately 7% faster. --- libavcodec/aarch64/mpegaudiodsp_neon.S | 226 +++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 libavcodec/aarch64/mpegaudiodsp_neon.S (limited to 'libavcodec/aarch64/mpegaudiodsp_neon.S') diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S new file mode 100644 index 0000000000..39875fed4a --- /dev/null +++ b/libavcodec/aarch64/mpegaudiodsp_neon.S @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2014 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +#define FRAC_BITS 23 // fractional bits for sb_samples and dct +#define WFRAC_BITS 16 // fractional bits for window +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15) /* evaluates to 24 with the two values above */ + +const tbl_rev128.s align=4 /* 16 byte indices listing the four 4-byte groups of a 128-bit vector in reverse order (12..15, 8..11, 4..7, 0..3); loaded into v27 by apply_window, presumably as a TBL shuffle mask that reverses 32-bit lanes - TODO confirm at the use site */ + .byte 12, 13, 14, 15 + .byte 8, 9, 10, 11 + .byte 4, 5, 6, 7 + .byte 0, 1, 2, 3 +endconst + +.macro apply_window type, st /* emits ff_mpadsp_apply_window_<type>_neon; the body is specialised with .ifc on type == fixed. Inputs read below: x0, x1 (pointers, each advanced by 64 bytes), x2 (dither_state), x3 (base from which samples2 is derived), w4 (incr). NOTE(review): x0/x1 look like the synthesis buffer and the window table - confirm against the C prototype */ +function ff_mpadsp_apply_window_\type\()_neon, export=1 + mov x7, x0 /* x7 = read cursor starting at x0; x0 itself is adjusted further down */ + sxtw x4, w4 // incr /* incr arrives as a 32-bit signed value; widen it for 64-bit address arithmetic */ + add x8, x0, #512<<2 /* x8 = x0 + 2048 bytes (512 four-byte elements) */ + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64 + st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64 + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64 /* net effect of the four ld1/st1: the first 128 bytes at x0 are copied to x0 + 2048 */ + movrel x15, tbl_rev128.s + ld1 {v27.4s}, [x15] /* v27 = the 16 index bytes of tbl_rev128.s */ +.ifc \type, fixed + lsl x4, x4, #1 /* fixed variant: incr *= 2, i.e. a byte stride for 2-byte elements - presumably 16-bit output samples, TODO confirm */ +.else + lsl x4, x4, #2 /* any other type: incr *= 4, i.e. a byte stride for 4-byte elements */ +.endif + add x10, x0, #45<<2 /* x10 = original x0 + 180 bytes (x0 has not been advanced yet) */ + add x0, x0, #16<<2 /* x0 += 64 bytes */ + add x1, x1, #16<<2 /* x1 += 64 bytes */ + add x5, x3, x4, lsl #5 /* together with the sub below: x5 = x3 + 31 * (scaled incr) */ + sub x5, x5, x4 // samples2 + neg x13, x4 // -incr + mov x9, #64<<2 /* x9 = 256 */ +.ifc \type, fixed + ld1r {v16.2s}, [x2] // dither_state /* the 32-bit value at x2 is replicated into both lanes of v16 */ + sxtl v16.2d, v16.2s /* sign-extend both lanes to 64 bits */ + movi v29.2d, #0 + movi v30.2d, #(1<