summaryrefslogtreecommitdiff
path: root/libavcodec/aarch64
diff options
context:
space:
mode:
authorJanne Grunau <janne-libav@jannau.net>2015-12-01 13:37:41 +0100
committerJanne Grunau <janne-libav@jannau.net>2015-12-14 16:45:01 +0100
commit705f5e5e155f6f280a360af220fc5b30cfcee702 (patch)
tree0ae61550e51a1843de9ee7fb40e3143bd361e4ad /libavcodec/aarch64
parentc33c1fa8af2b2e82418a06901b6ad17b3d61b73e (diff)
arm64: port synth_filter_float_neon from arm
~25% faster DTS decoding overall. The checkasm CPU cycle counts are not that useful, since synth_filter_float() calls FFTContext.imdct_half().
                          cortex-a57  cortex-a53
synth_filter_float_c:        1866.2      3490.9
synth_filter_float_neon:      915.0      1531.5
With fftc.imdct_half forced to imdct_half_neon:
                          cortex-a57  cortex-a53
synth_filter_float_c:        1718.4      3025.3
synth_filter_float_neon:      926.2      1530.1
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--libavcodec/aarch64/Makefile3
-rw-r--r--libavcodec/aarch64/asm-offsets.h3
-rw-r--r--libavcodec/aarch64/dcadsp_init.c16
-rw-r--r--libavcodec/aarch64/synth_filter_neon.S119
4 files changed, 140 insertions, 1 deletions
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 0b614a3ac2..2175578f8e 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -16,7 +16,8 @@ OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o
+NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \
+ aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
diff --git a/libavcodec/aarch64/asm-offsets.h b/libavcodec/aarch64/asm-offsets.h
index 45b5c40f80..60e32ddd1d 100644
--- a/libavcodec/aarch64/asm-offsets.h
+++ b/libavcodec/aarch64/asm-offsets.h
@@ -27,4 +27,7 @@
#define CELT_TMP 0x10
#define CELT_TWIDDLE (CELT_TMP + 0x8) // loaded as pair
+/* FFTContext */
+#define IMDCT_HALF 0x48 // byte offset of the imdct_half member, used by the asm as [x0, #IMDCT_HALF]
+
#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
diff --git a/libavcodec/aarch64/dcadsp_init.c b/libavcodec/aarch64/dcadsp_init.c
index ad910700f0..c66ec3f538 100644
--- a/libavcodec/aarch64/dcadsp_init.c
+++ b/libavcodec/aarch64/dcadsp_init.c
@@ -22,7 +22,15 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
#include "libavcodec/dcadsp.h"
+#include "libavcodec/fft.h"
+
+#include "asm-offsets.h"
+
+#if HAVE_NEON || HAVE_VFP
+AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF); // ties the hard-coded asm offset to the C struct layout; presumably a build-time check (macro body not visible here)
+#endif
void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
@@ -49,3 +57,11 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
s->decode_hf = ff_decode_hf_neon;
}
}
+
+av_cold void ff_synth_filter_init_aarch64(SynthFilterContext *s)
+{
+ const int flags = av_get_cpu_flags(); /* queried once, then tested below */
+
+ if (!have_neon(flags)) return; /* no NEON: leave s untouched */
+ s->synth_filter_float = ff_synth_filter_float_neon; /* install the NEON routine */
+}
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
new file mode 100644
index 0000000000..9551bff8e3
--- /dev/null
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm-offsets.h"
+
+#include "libavutil/aarch64/asm.S"
+
+.macro inner_loop
+ ld1 {v29.4s}, [x9], x15 // v29 = 4 floats at x9, then x9 += x15
+ ld1 {v28.4s}, [x8], x15 // v28 = 4 floats at x8, then x8 += x15
+ ld1 {v30.4s}, [x10], x15 // v30 = 4 floats at x10, then x10 += x15
+ ld1 {v31.4s}, [x11], x15 // v31 = 4 floats at x11, then x11 += x15
+ rev64 v28.4s, v28.4s // swap the two floats inside each 64-bit half of v28
+ ld1 {v24.4s}, [x4], x15 // v24 = 4 floats at x4, then x4 += x15
+ ld1 {v25.4s}, [x5], x15 // v25 = 4 floats at x5, then x5 += x15
+ rev64 v31.4s, v31.4s // same half-wise swap for v31
+ ld1 {v26.4s}, [x6], x15 // v26 = 4 floats at x6, then x6 += x15
+ fmla v5.4s, v25.4s, v29.4s // v5 += v25 * v29
+ ld1 {v27.4s}, [x7], x15 // v27 = 4 floats at x7, then x7 += x15
+ ext v28.16b, v28.16b, v28.16b, #8 // swap the 64-bit halves; combined with rev64 the 4 lanes of v28 are fully reversed
+ ext v31.16b, v31.16b, v31.16b, #8 // likewise completes the lane reversal of v31
+ fmla v6.4s, v26.4s, v30.4s // v6 += v26 * v30
+ fmls v4.4s, v24.4s, v28.4s // v4 -= v24 * reversed v28
+ fmla v7.4s, v27.4s, v31.4s // v7 += v27 * reversed v31
+.endm // no instruction above writes the condition flags, so callers set flags before the macro and branch after it
+
+function ff_synth_filter_float_neon, export=1
+ ldr w7, [x2] // w7 = *synth_buf_offset
+ ldr x9, [x0, #IMDCT_HALF] // imdct_half function pointer
+ sxtw x7, w7 // sign-extend the offset for 64-bit address arithmetic
+ stp x3, x4, [sp, #-64]! // reserve a 64-byte frame; save x3/x4 (reloaded below as synth_buf_2/window)
+ add x1, x1, x7, lsl #2 // synth_buf = x1 + offset * 4 bytes
+ sub w8, w7, #32 // offset - 32, wrapped to 0..511 two lines below
+ stp x5, x1, [sp, #16] // save x5 (reloaded below as out) and synth_buf
+ bic x7, x7, #63 // clear the low 6 bits: offset rounded down to a multiple of 64
+ and w8, w8, #511 // (offset - 32) & 511
+ stp x7, x30, [sp, #32] // save the rounded offset and the return address
+ str w8, [x2] // *synth_buf_offset = (offset - 32) & 511
+ str s0, [sp, #48] // keep s0 across the call; it multiplies the outputs at label 4
+
+ mov x2, x6 // in
+
+ blr x9 // call imdct_half with x0 unchanged, x1 = synth_buf, x2 = in
+
+ ldp x2, x4, [sp] // synth_buf_2, window
+ ldp x13, x9, [sp, #16] // out, synth_buf
+ ldp x0, x30, [sp, #32] // offset rounded down to a multiple of 64, return address
+ ldr s0, [sp, #48] // restore s0
+
+ add x3, x2, #16*4 // synth_buf_2 + 16
+ add x14, x13, #16*4 // out + 16
+ add x8, x9, #12*4 // synth_buf + 12
+ mov x15, #64*4 // post-increment stride used by inner_loop: 64 floats in bytes
+ mov x1, #4 // outer loop counter: 4 passes of 4 lanes each
+1:
+ add x10, x9, #16*4 // x9 + 16 floats (synth_buf side)
+ add x11, x8, #16*4 // x8 + 16 floats (synth_buf side)
+ add x5, x4, #16*4 // window + 16
+ add x6, x4, #32*4 // window + 32
+ add x7, x4, #48*4 // window + 48
+
+ ld1 {v4.4s}, [x2] // a, seeded from synth_buf_2
+ ld1 {v5.4s}, [x3] // b, seeded from synth_buf_2 + 16
+ movi v6.4s, #0 // c
+ movi v7.4s, #0 // d
+
+ mov x12, #512 // countdown in floats, stepped by 64
+2:
+ sub x12, x12, #64
+ cmp x12, x0 // flags are consumed by b.gt below; inner_loop leaves them untouched
+ inner_loop
+ b.gt 2b // repeat while x12 is above the rounded offset
+
+ sub x8, x8, #512*4 // step x8 back by 512 floats
+ sub x9, x9, #512*4 // step x9 back by 512 floats
+ cbz x12, 4f // countdown already at 0: all 8 inner_loop steps done, skip loop 3
+ sub x10, x10, #512*4 // step x10 back by 512 floats for the remaining steps
+ sub x11, x11, #512*4 // step x11 back by 512 floats for the remaining steps
+3:
+ subs x12, x12, #64 // flags are consumed by b.gt below
+ inner_loop
+ b.gt 3b // repeat until the countdown reaches 0
+4:
+ subs x1, x1, #1 // one pass done; flags are consumed by b.le below
+ fmul v4.4s, v4.4s, v0.s[0] // a *= s0
+ fmul v5.4s, v5.4s, v0.s[0] // b *= s0
+ st1 {v6.4s}, [x2], #16 // c -> synth_buf_2, pointer advances 4 floats
+ st1 {v7.4s}, [x3], #16 // d -> synth_buf_2 + 16, pointer advances 4 floats
+ st1 {v4.4s}, [x13], #16 // a -> out, pointer advances 4 floats
+ st1 {v5.4s}, [x14], #16 // b -> out + 16, pointer advances 4 floats
+ b.le 10f // leave after the 4th pass
+
+ sub x4, x4, #508*4 // window: inner_loop advanced x4 by 8 * 64 floats, so the net step is +4
+ add x9, x9, #4*4 // synth_buf: forward 4 floats for the next pass
+ sub x8, x8, #4*4 // synth_buf: back 4 floats for the next pass
+ b 1b
+
+10:
+ add sp, sp, #64 // release the 64-byte frame
+ ret
+endfunc