summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mans Rullgard <mans@mansr.com> 2012-01-12 23:44:20 +0000
committer Mans Rullgard <mans@mansr.com> 2012-01-28 14:56:18 +0000
commit be822d77b6f8363df0a49c568662187d655711e2 (patch)
tree e22a09fb71a3492d4fb2e8fae5d00ffa9c7e14fc
parent 8996ed2b7324f35798c4198b495f8a6955d2047d (diff)
aacsbr: ARM NEON optimised sbrdsp functions
Overall speedup of HE-AAC decoding 2.3x on Cortex-A8, 1.2x on A9. Signed-off-by: Mans Rullgard <mans@mansr.com>
-rw-r--r-- libavcodec/arm/Makefile 4
-rw-r--r-- libavcodec/arm/sbrdsp_init_arm.c 70
-rw-r--r-- libavcodec/arm/sbrdsp_neon.S 411
-rw-r--r-- libavcodec/sbrdsp.c 4
-rw-r--r-- libavcodec/sbrdsp.h 1
5 files changed, 490 insertions, 0 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index fc1711395b..e7fa7e511a 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -1,6 +1,8 @@
OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
arm/ac3dsp_arm.o
+OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o
+
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
@@ -60,6 +62,8 @@ NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o \
NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
+NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o
+
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o \
diff --git a/libavcodec/arm/sbrdsp_init_arm.c b/libavcodec/arm/sbrdsp_init_arm.c
new file mode 100644
index 0000000000..2ab0df829d
--- /dev/null
+++ b/libavcodec/arm/sbrdsp_init_arm.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/sbrdsp.h"
+
+void ff_sbr_sum64x5_neon(float *z); /* prototypes for the routines exported by arm/sbrdsp_neon.S */
+float ff_sbr_sum_square_neon(float (*x)[2], int n);
+void ff_sbr_neg_odd_64_neon(float *x);
+void ff_sbr_qmf_pre_shuffle_neon(float *z);
+void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
+void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
+ const float *g_filt, int m_max, int ixh);
+void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
+ const float alpha0[2], const float alpha1[2],
+ float bw, int start, int end);
+void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
+
+void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m, /* the four variants below fill s->hf_apply_noise[0..3] in ff_sbrdsp_init_arm() */
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
+av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s) /* points the function pointers in *s at the NEON routines when HAVE_NEON is set */
+{
+ if (HAVE_NEON) { /* HAVE_NEON is not defined in this file; presumably a build-time 0/1 constant from config.h - confirm */
+ s->sum64x5 = ff_sbr_sum64x5_neon;
+ s->sum_square = ff_sbr_sum_square_neon;
+ s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
+ s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
+ s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
+ s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
+ s->hf_g_filt = ff_sbr_hf_g_filt_neon;
+ s->hf_gen = ff_sbr_hf_gen_neon;
+ s->autocorrelate = ff_sbr_autocorrelate_neon;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; /* slot n receives ff_sbr_hf_apply_noise_<n>_neon */
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
+ } /* when HAVE_NEON is zero, *s is left untouched */
+}
diff --git a/libavcodec/arm/sbrdsp_neon.S b/libavcodec/arm/sbrdsp_neon.S
new file mode 100644
index 0000000000..835c32caee
--- /dev/null
+++ b/libavcodec/arm/sbrdsp_neon.S
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+function ff_sbr_sum64x5_neon, export=1 /* r0 = z: z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63 */
+ push {lr} /* lr is used below as the fifth data pointer */
+ add r1, r0, # 64*4 /* r1 = &z[64] */
+ add r2, r0, #128*4 /* r2 = &z[128] */
+ add r3, r0, #192*4 /* r3 = &z[192] */
+ add lr, r0, #256*4 /* lr = &z[256] */
+ mov r12, #64 /* floats still to produce */
+1:
+ vld1.32 {q0}, [r0,:128] /* no writeback here: r0 is advanced by the store at the end of the pass */
+ vld1.32 {q1}, [r1,:128]!
+ vadd.f32 q0, q0, q1
+ vld1.32 {q2}, [r2,:128]!
+ vadd.f32 q0, q0, q2
+ vld1.32 {q3}, [r3,:128]!
+ vadd.f32 q0, q0, q3
+ vld1.32 {q8}, [lr,:128]!
+ vadd.f32 q0, q0, q8
+ vst1.32 {q0}, [r0,:128]! /* result overwrites the first 64-float segment */
+ subs r12, #4 /* four floats per pass */
+ bgt 1b
+ pop {pc} /* return through the saved lr */
+endfunc
+
+function ff_sbr_sum_square_neon, export=1 /* r0 = x, r1 = n: sum of the squares of the floats read from x, left in d0[0] */
+ vmov.f32 q0, #0.0 /* four running partial sums */
+1:
+ vld1.32 {q1}, [r0,:128]! /* four floats, i.e. two [2] pairs of x */
+ vmla.f32 q0, q1, q1
+ subs r1, r1, #2 /* n counts pairs; two are consumed per pass */
+ bgt 1b
+ vadd.f32 d0, d0, d1 /* fold the four partial sums ... */
+ vpadd.f32 d0, d0, d0 /* ... down to a single value in d0[0] */
+NOVFP vmov.32 r0, d0[0] /* NOTE(review): NOVFP is defined in asm.S; presumably keeps this copy to r0 only when floats are returned in core registers - confirm */
+ bx lr
+endfunc
+
+function ff_sbr_neg_odd_64_neon, export=1 /* r0 = x: flips the sign bit of x[1], x[3], ..., x[63] in place */
+ mov r1, r0 /* r0 = load pointer, r1 = store pointer */
+ vmov.i32 q8, #1<<31 /* sign-bit mask in every lane */
+ vld2.32 {q0,q1}, [r0,:128]! /* de-interleave 8 floats: q0 = even-indexed, q1 = odd-indexed */
+ veor q1, q1, q8 /* flip the sign of the odd-indexed ones */
+ vld2.32 {q2,q3}, [r0,:128]!
+ .rept 3 /* loads run two groups ahead of the stores; 8 groups of 8 floats in total */
+ vst2.32 {q0,q1}, [r1,:128]! /* re-interleave on the way out */
+ veor q3, q3, q8
+ vld2.32 {q0,q1}, [r0,:128]!
+ vst2.32 {q2,q3}, [r1,:128]!
+ veor q1, q1, q8
+ vld2.32 {q2,q3}, [r0,:128]!
+ .endr
+ veor q3, q3, q8
+ vst2.32 {q0,q1}, [r1,:128]! /* drain the two groups still held in registers */
+ vst2.32 {q2,q3}, [r1,:128]!
+ bx lr
+endfunc
+
+function ff_sbr_qmf_pre_shuffle_neon, export=1 /* r0 = z: z[64] = z[0], z[65] = z[1]; then z[64+2k] = -z[64-k], z[64+2k+1] = z[k+1] for k = 1..31 */
+ add r1, r0, #60*4 /* r1 reads z[60..63] first and then walks downwards */
+ add r2, r0, #64*4 /* r2 = output pointer, starts at z[64] */
+ vld1.32 {d0}, [r0,:64]!
+ vst1.32 {d0}, [r2,:64]! /* z[64..65] = z[0..1] */
+ mov r3, #-16 /* post-decrement step for r1: four floats */
+ mov r12, #24 /* loop counter, reduced by 8 per pass: three passes */
+ vmov.i32 q8, #1<<31 /* sign-bit mask */
+ vld1.32 {q0}, [r1,:128], r3
+ vld1.32 {d2}, [r0,:64]! /* z[2..3]: low half of the first ascending group */
+1:
+ vld1.32 {d3,d4}, [r0,:128]!
+ vrev64.32 q0, q0 /* vrev64 plus the vswp below reverse the four descending floats */
+ vld1.32 {q9}, [r1,:128], r3
+ veor q0, q0, q8 /* negate them */
+ vld1.32 {d5,d6}, [r0,:128]!
+ vswp d0, d1
+ vrev64.32 q9, q9
+ vst2.32 {q0,q1}, [r2,:64]! /* interleave negated descending floats with ascending ones */
+ vmov q10, q2
+ veor q9, q9, q8
+ vmov d2, d6 /* carry the last two ascending floats into the next pass */
+ vswp d18, d19
+ vld1.32 {q0}, [r1,:128], r3
+ vst2.32 {q9,q10}, [r2,:64]!
+ subs r12, r12, #8
+ bgt 1b
+ vld1.32 {d3,d4}, [r0,:128]! /* tail: the remaining 14 output floats (8 + 4 + 2) */
+ vrev64.32 q0, q0
+ vld1.32 {q9}, [r1,:128], r3
+ veor q0, q0, q8
+ vld1.32 {d5}, [r0,:64]!
+ vswp d0, d1
+ vrev64.32 q9, q9
+ vst2.32 {q0,q1}, [r2,:64]!
+ vswp d4, d5
+ veor q1, q9, q8
+ vst2.32 {d3,d5}, [r2,:64]!
+ vst2.32 {d2[0],d4[0]}, [r2,:64]! /* final pair: one lane from each register */
+ bx lr
+endfunc
+
+function ff_sbr_qmf_post_shuffle_neon, export=1 /* r0 = W, r1 = z: W[k][0] = -z[63-k], W[k][1] = z[k] for k = 0..31 */
+ add r2, r1, #60*4 /* r2 reads z[60..63] first and then walks downwards */
+ mov r3, #-16 /* post-decrement step for r2: four floats */
+ mov r12, #32 /* W entries left; eight are written per pass */
+ vmov.i32 q8, #1<<31 /* sign-bit mask */
+ vld1.32 {q0}, [r2,:128], r3
+ vld1.32 {q1}, [r1,:128]!
+1:
+ pld [r2, #-32]
+ vrev64.32 q0, q0 /* swap floats within each half of the descending group */
+ vswp d2, d3 /* swap halves of the ascending group so the vst2 pairs below line up */
+ veor q0, q0, q8 /* negate the descending floats */
+ vld1.32 {q2}, [r2,:128], r3
+ vld1.32 {q3}, [r1,:128]!
+ vst2.32 {d1,d3}, [r0,:128]! /* two W entries: (negated descending, ascending) */
+ vst2.32 {d0,d2}, [r0,:128]!
+ pld [r2, #-32]
+ vrev64.32 q2, q2 /* second half of the pass: same steps on q2/q3 */
+ vswp d6, d7
+ veor q2, q2, q8
+ vld1.32 {q0}, [r2,:128], r3 /* preload for the next pass (also executed on the last pass) */
+ vld1.32 {q1}, [r1,:128]!
+ vst2.32 {d5,d7}, [r0,:128]!
+ vst2.32 {d4,d6}, [r0,:128]!
+ subs r12, r12, #8
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_qmf_deint_neg_neon, export=1 /* r0 = v, r1 = src: v[i] = src[63-2i], v[63-i] = -src[62-2i] for i = 0..31 */
+ add r1, r1, #60*4 /* read src from its last four floats downwards */
+ add r2, r0, #62*4 /* r2 = descending store pointer, starts at v[62] */
+ mov r3, #-16 /* post-decrement step for r1: four floats */
+ mov r12, #32 /* values of i left; two are handled per pass */
+ vmov.i32 d2, #1<<31 /* sign-bit mask */
+1:
+ vld2.32 {d0,d1}, [r1,:128], r3 /* d0 = even-indexed src floats, d1 = odd-indexed */
+ veor d0, d0, d2 /* negate the even-indexed ones */
+ vrev64.32 d1, d1 /* odd-indexed ones are stored in reversed order */
+ vst1.32 {d0}, [r2,:64]
+ vst1.32 {d1}, [r0,:64]!
+ sub r2, r2, #8 /* step the descending pointer back two floats */
+ subs r12, r12, #2
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_qmf_deint_bfly_neon, export=1 /* r0 = v, r1 = src0, r2 = src1: v[i] = src0[i] - src1[63-i], v[127-i] = src0[i] + src1[63-i] for i = 0..63 */
+ push {lr} /* lr is used below to hold the -16 step */
+ add r2, r2, #60*4 /* src1 is read from its last four floats downwards */
+ add r3, r0, #124*4 /* r3 = descending store pointer, starts at v[124] */
+ mov r12, #64 /* values of i left; four are handled per pass */
+ mov lr, #-16
+1:
+ vld1.32 {q0}, [r1,:128]!
+ vld1.32 {q1}, [r2,:128], lr
+ vrev64.32 q2, q0 /* reversed copies so each lane meets its mirrored partner */
+ vrev64.32 q3, q1
+ vadd.f32 d3, d4, d3 /* sums, built in the order they are stored at the descending pointer */
+ vadd.f32 d2, d5, d2
+ vsub.f32 d0, d0, d7 /* differences, in ascending order */
+ vsub.f32 d1, d1, d6
+ vst1.32 {q1}, [r3,:128], lr
+ vst1.32 {q0}, [r0,:128]!
+ subs r12, r12, #4
+ bgt 1b
+ pop {pc} /* return through the saved lr */
+endfunc
+
+function ff_sbr_hf_g_filt_neon, export=1 /* r0 = Y, r1 = X_high, r2 = g_filt, r3 = m_max, [sp] = ixh: Y[m] = X_high[m][ixh] * g_filt[m] on both components */
+ ldr r12, [sp] /* ixh is the fifth argument, passed on the stack */
+ add r1, r1, r12, lsl #3 /* r1 = &X_high[0][ixh] (8 bytes per [2] entry) */
+ mov r12, #40*2*4 /* byte stride from X_high[m] to X_high[m+1] */
+ sub r3, r3, #1 /* biased count: the sign after the loop tells whether one element remains */
+ vld2.32 {d2[],d3[]},[r2,:64]! /* two gains, each replicated across its own d register */
+ vld1.32 {d0}, [r1,:64], r12
+1:
+ vld1.32 {d1}, [r1,:64], r12
+ vmul.f32 q3, q0, q1 /* two complex entries scaled by their gains */
+ vld2.32 {d2[],d3[]},[r2,:64]! /* preload gains and the first entry for the next pass */
+ vld1.32 {d0}, [r1,:64], r12
+ vst1.32 {q3}, [r0,:64]!
+ subs r3, r3, #2
+ bgt 1b
+ it lt
+ bxlt lr /* count went negative: no single element left */
+ vmul.f32 d0, d0, d2 /* count hit zero: one last entry, already preloaded */
+ vst1.32 {d0}, [r0,:64]!
+ bx lr
+endfunc
+
+function ff_sbr_hf_gen_neon, export=1 /* r0 = X_high, r1 = X_low, r2 = alpha0, r3 = alpha1; bw, start, end follow (see the loads below) */
+NOVFP vld1.32 {d1[]}, [sp,:32] /* bw taken from the stack; NOTE(review): NOVFP/VFP come from asm.S, presumably selecting one of these two lines by float calling convention - confirm */
+VFP vdup.32 d1, d0[0] /* bw taken from s0 */
+ vmul.f32 d0, d1, d1 /* d0 = bw * bw in both lanes, d1 = bw in both lanes */
+ vld1.32 {d3}, [r2,:64] /* d3 = alpha0[0..1] */
+ vld1.32 {d2}, [r3,:64] /* d2 = alpha1[0..1] */
+ vmul.f32 q0, q0, q1 /* d0 = bw*bw * alpha1, d1 = bw * alpha0 */
+ ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS] /* r2 = start, r3 = end; the offset skips bw when bw itself was on the stack */
+ vtrn.32 d0, d1 /* regroup the scaled coefficients (and a negated copy in d18) into the lane layout the vmla chain below expects */
+ vneg.f32 d18, d1
+ vtrn.32 d18, d1
+ add r0, r0, r2, lsl #3 /* r0 = &X_high[start] */
+ add r1, r1, r2, lsl #3
+ sub r1, r1, #2*8 /* r1 = &X_low[start - 2] */
+ sub r3, r3, r2 /* r3 = end - start = entries to produce */
+ vld1.32 {q1}, [r1,:128]! /* the two X_low entries preceding the first output */
+1:
+ vld1.32 {q3}, [r1,:128]! /* next two X_low entries; the accumulation below is built on top of them */
+ vrev64.32 q2, q1
+ vmov q8, q3 /* unmodified copy, becomes the "previous two" of the next pass */
+ vrev64.32 d20, d3
+ vrev64.32 d21, d6
+ vmla.f32 q3, q1, d0[0]
+ vmla.f32 d6, d4, d18
+ vmla.f32 d7, d20, d18
+ vmla.f32 d6, d3, d0[1]
+ vmla.f32 d7, d16, d0[1]
+ vmla.f32 d6, d5, d1
+ vmla.f32 d7, d21, d1
+ vmov q1, q8
+ vst1.32 {q3}, [r0,:128]! /* two X_high entries per pass */
+ subs r3, r3, #2
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_autocorrelate_neon, export=1 /* r0 = x, r1 = phi: accumulates lagged products over the 40 [2] entries of x and stores eight floats of phi (see the stores at the end) */
+ vld1.32 {q0}, [r0,:128]! /* x[0], x[1] */
+ vmov.f32 q1, #0.0 /* q1, q3 and d20 are accumulators */
+ vmov.f32 q3, #0.0
+ vmov.f32 d20, #0.0
+ vmul.f32 d21, d1, d1 /* squares of x[1] seed the second half of q10 */
+ vmov q8, q0 /* keep x[0], x[1] for the post-loop terms */
+ vmov q11, q0
+ mov r12, #36 /* entries handled inside the loop, two per pass */
+1:
+ vld1.32 {q2}, [r0,:128]! /* next two entries; q0 still holds the previous two */
+ vrev64.32 q12, q2 /* re/im-swapped copy of the new entries */
+ vmla.f32 q10, q2, q2
+ vmla.f32 d2, d1, d4
+ vmla.f32 d3, d1, d24
+ vmla.f32 d6, d0, d4
+ vmla.f32 d7, d0, d24
+ vmla.f32 d2, d4, d5
+ vmla.f32 d3, d4, d25
+ vmla.f32 d6, d1, d5
+ vmla.f32 d7, d1, d25
+ vmov q0, q2 /* slide the two-entry window */
+ subs r12, r12, #2
+ bgt 1b
+ vld1.32 {q2}, [r0,:128]! /* last two entries, x[38] and x[39] */
+ vrev64.32 q12, q2
+ vmla.f32 d2, d1, d4
+ vmla.f32 d3, d1, d24
+ vmla.f32 d6, d0, d4
+ vmla.f32 d7, d0, d24
+ vadd.f32 d20, d20, d21
+ vrev64.32 d18, d17
+ vmla.f32 d6, d1, d5
+ vmla.f32 d7, d1, d25
+ vmov q0, q1
+ vmla.f32 d0, d16, d17
+ vmla.f32 d1, d16, d18
+ vmla.f32 d2, d4, d5
+ vmla.f32 d3, d4, d25
+ vneg.f32 s15, s15
+ vmov d21, d20
+ vpadd.f32 d0, d0, d2
+ vpadd.f32 d7, d6, d7
+ vtrn.32 d1, d3
+ vsub.f32 d6, d1, d3
+ vmla.f32 d20, d22, d22
+ vmla.f32 d21, d4, d4
+ vtrn.32 d0, d6
+ vpadd.f32 d20, d20, d21
+ vst1.32 {q3}, [r1,:128]! /* floats 0..3 of phi: phi[0][0][0..1] and phi[0][1][0..1] */
+ vst1.32 {d20[1]}, [r1,:32] /* float 4: phi[1][0][0] */
+ add r1, r1, #2*4
+ vst1.32 {d0}, [r1,:64] /* floats 6..7: phi[1][1][0..1] */
+ add r1, r1, #4*4
+ vst1.32 {d20[0]}, [r1,:32] /* float 10: phi[2][1][0] */
+ bx lr
+endfunc
+
+function ff_sbr_hf_apply_noise_0_neon, export=1 /* r0 = Y, r1 = s_m, r2 = q_filt, r3 = noise, stack = kx, m_max: where s_m != 0 add signed s_m to the real part of Y, elsewhere add q_filt * noise-table entry */
+ vmov.i32 d3, #0 /* d3 = sign mask XORed onto s_m; zero here, so s_m is added as is */
+.Lhf_apply_noise_0: /* shared body: ff_sbr_hf_apply_noise_2_neon branches here with d3 = 1<<31 */
+ push {r4,lr}
+ ldr r12, [sp, #12] /* m_max: second stack argument, past the 8 bytes just pushed */
+ movrel r4, X(ff_sbr_noise_table)
+ add r3, r3, #1
+ bfc r3, #9, #23 /* noise = (noise + 1) & 0x1ff */
+ sub r12, r12, #1 /* biased count: the sign after the loop tells whether one element remains */
+1:
+ add lr, r4, r3, lsl #3 /* lr = &ff_sbr_noise_table[noise] */
+ vld2.32 {q0}, [r0,:64] /* d0 = two real parts of Y, d1 = two imaginary parts */
+ vld2.32 {q3}, [lr,:64] /* two noise-table entries, de-interleaved the same way */
+ vld1.32 {d2}, [r1,:64]! /* two s_m values */
+ vld1.32 {d18}, [r2,:64]! /* two q_filt values */
+ vceq.f32 d16, d2, #0 /* lanes where s_m == 0 */
+ veor d2, d2, d3 /* apply the sign mask to s_m */
+ vmov q2, q0
+ vmla.f32 d0, d6, d18 /* q0 = Y + q_filt * noise */
+ vmla.f32 d1, d7, d18
+ vadd.f32 d4, d4, d2 /* q2 = Y with signed s_m added to the real parts */
+ add r3, r3, #2
+ bfc r3, #9, #23 /* noise = (noise + 2) & 0x1ff */
+ vbif d0, d4, d16 /* keep the noise result only where s_m == 0 */
+ vbif d1, d5, d16
+ vst2.32 {q0}, [r0,:64]!
+ subs r12, r12, #2
+ bgt 1b
+ blt 2f /* count went negative: no single element left */
+ add lr, r4, r3, lsl #3 /* count hit zero: one last element */
+ vld1.32 {d0}, [r0,:64] /* d0 = (real, imaginary) of the last Y entry */
+ vld1.32 {d6}, [lr,:64]
+ vld1.32 {d2[]}, [r1,:32]!
+ vld1.32 {d18[]}, [r2,:32]! /* q_filt goes into d18 as in the loop; d3 must keep the sign mask for the veor below */
+ vceq.f32 d4, d2, #0
+ veor d2, d2, d3
+ vmov d1, d0
+ vmla.f32 d0, d6, d18
+ vadd.f32 s2, s2, s4 /* real part of the saved copy += signed s_m */
+ vbif d0, d1, d4
+ vst1.32 {d0}, [r0,:64]!
+2:
+ pop {r4,pc}
+endfunc
+
+function ff_sbr_hf_apply_noise_1_neon, export=1 /* r0 = Y, r1 = s_m, r2 = q_filt, r3 = noise, stack = kx, m_max: where s_m != 0 add s_m to the imaginary part of Y with a sign alternating per element, elsewhere add q_filt * noise-table entry */
+ ldr r12, [sp] /* kx: first stack argument (nothing pushed yet) */
+ push {r4,lr}
+ lsl r12, r12, #31 /* bit 0 of kx moved to the sign-bit position */
+ eor lr, r12, #1<<31 /* the opposite sign */
+ vmov d3, r12, lr /* d3[0] = sign for even offsets from Y, d3[1] = sign for odd offsets */
+.Lhf_apply_noise_1: /* shared body: ff_sbr_hf_apply_noise_3_neon branches here with the two lanes of d3 swapped */
+ ldr r12, [sp, #12] /* m_max: second stack argument, past the 8 bytes pushed */
+ movrel r4, X(ff_sbr_noise_table)
+ add r3, r3, #1
+ bfc r3, #9, #23 /* noise = (noise + 1) & 0x1ff */
+ sub r12, r12, #1 /* biased count: the sign after the loop tells whether one element remains */
+1:
+ add lr, r4, r3, lsl #3 /* lr = &ff_sbr_noise_table[noise] */
+ vld2.32 {q0}, [r0,:64] /* d0 = two real parts of Y, d1 = two imaginary parts */
+ vld2.32 {q3}, [lr,:64] /* two noise-table entries, de-interleaved the same way */
+ vld1.32 {d2}, [r1,:64]! /* two s_m values */
+ vld1.32 {d18}, [r2,:64]! /* two q_filt values */
+ vceq.f32 d16, d2, #0 /* lanes where s_m == 0 */
+ veor d2, d2, d3 /* lane 0 gets the even-offset sign, lane 1 the odd-offset sign */
+ vmov q2, q0
+ vmla.f32 d0, d6, d18 /* q0 = Y + q_filt * noise */
+ vmla.f32 d1, d7, d18
+ vadd.f32 d5, d5, d2 /* q2 = Y with signed s_m added to the imaginary parts */
+ add r3, r3, #2
+ bfc r3, #9, #23 /* noise = (noise + 2) & 0x1ff */
+ vbif d0, d4, d16 /* keep the noise result only where s_m == 0 */
+ vbif d1, d5, d16
+ vst2.32 {q0}, [r0,:64]!
+ subs r12, r12, #2
+ bgt 1b
+ blt 2f /* count went negative: no single element left */
+ add lr, r4, r3, lsl #3 /* count hit zero: one last element, at an even offset from the initial Y */
+ vld1.32 {d0}, [r0,:64] /* d0 = (real, imaginary) of the last Y entry */
+ vld1.32 {d6}, [lr,:64]
+ vld1.32 {d2[]}, [r1,:32]! /* s_m replicated to both lanes */
+ vld1.32 {d18[]}, [r2,:32]!
+ vceq.f32 d4, d2, #0
+ veor d2, d2, d3 /* s4 = s_m with the even-offset sign, s5 = s_m with the odd-offset sign */
+ vmov d1, d0
+ vmla.f32 d0, d6, d18
+ vadd.f32 s3, s3, s4 /* imaginary part += s_m with the even-offset sign (lane 0), matching what the loop applies to even offsets */
+ vbif d0, d1, d4
+ vst1.32 {d0}, [r0,:64]!
+2:
+ pop {r4,pc}
+endfunc
+
+function ff_sbr_hf_apply_noise_2_neon, export=1 /* same arguments as ff_sbr_hf_apply_noise_0_neon, but with the sign bit set in the s_m mask */
+ vmov.i32 d3, #1<<31 /* d3 is XORed onto s_m in the shared body, flipping its sign before the add */
+ b .Lhf_apply_noise_0 /* enters before the push, so the shared body saves r4/lr itself */
+endfunc
+
+function ff_sbr_hf_apply_noise_3_neon, export=1 /* same arguments as ff_sbr_hf_apply_noise_1_neon, with the two per-lane signs exchanged */
+ ldr r12, [sp] /* kx: first stack argument (nothing pushed yet) */
+ push {r4,lr} /* the shared label sits after the push in ff_sbr_hf_apply_noise_1_neon, so it is done here */
+ lsl r12, r12, #31 /* bit 0 of kx moved to the sign-bit position */
+ eor lr, r12, #1<<31 /* the opposite sign */
+ vmov d3, lr, r12 /* d3[0] = opposite sign, d3[1] = kx sign: lanes swapped relative to ff_sbr_hf_apply_noise_1_neon */
+ b .Lhf_apply_noise_1
+endfunc
diff --git a/libavcodec/sbrdsp.c b/libavcodec/sbrdsp.c
index 7be962ed77..2711e71338 100644
--- a/libavcodec/sbrdsp.c
+++ b/libavcodec/sbrdsp.c
@@ -20,6 +20,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "config.h"
#include "libavutil/attributes.h"
#include "sbrdsp.h"
@@ -234,4 +235,7 @@ av_cold void ff_sbrdsp_init(SBRDSPContext *s)
s->hf_apply_noise[1] = sbr_hf_apply_noise_1;
s->hf_apply_noise[2] = sbr_hf_apply_noise_2;
s->hf_apply_noise[3] = sbr_hf_apply_noise_3;
+
+ if (ARCH_ARM)
+ ff_sbrdsp_init_arm(s);
}
diff --git a/libavcodec/sbrdsp.h b/libavcodec/sbrdsp.h
index 2f6cf1e4c5..88285b07ec 100644
--- a/libavcodec/sbrdsp.h
+++ b/libavcodec/sbrdsp.h
@@ -43,5 +43,6 @@ typedef struct SBRDSPContext {
extern const float ff_sbr_noise_table[][2];
void ff_sbrdsp_init(SBRDSPContext *s);
+void ff_sbrdsp_init_arm(SBRDSPContext *s);
#endif