summaryrefslogtreecommitdiff
path: root/libavcodec/arm/dcadsp_vfp.S
diff options
context:
space:
mode:
authorMartin Storsjö <martin@martin.st>2013-07-19 11:03:32 +0300
committerMartin Storsjö <martin@martin.st>2013-07-22 10:15:39 +0300
commitba6836c966debc56314ce2ef133c7f0c1fdfdeac (patch)
tree6d2414eec296337d4f89e5575b8082b005684ed3 /libavcodec/arm/dcadsp_vfp.S
parentb63bb251ea6d6ba23295294e37a92625c0192206 (diff)
arm: Add VFP-accelerated version of dca_lfe_fir
Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/dcadsp_vfp.S')
-rw-r--r--libavcodec/arm/dcadsp_vfp.S220
1 files changed, 220 insertions, 0 deletions
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000000..57e16196f7
--- /dev/null
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT .req a1
+PIN .req a2
+PCOEF .req a3
+DECIFACTOR .req a4
+OLDFPSCR .req a4
+COUNTER .req ip
+
+SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0 .req s4
+IN1 .req s5
+IN2 .req s6
+IN3 .req s7
+IN4 .req s0
+IN5 .req s1
+IN6 .req s2
+IN7 .req s3
+COEF0 .req s8 @ coefficient elements
+COEF1 .req s9
+COEF2 .req s10
+COEF3 .req s11
+COEF4 .req s12
+COEF5 .req s13
+COEF6 .req s14
+COEF7 .req s15
+ACCUM0 .req s16 @ double-buffered multiply-accumulate results
+ACCUM4 .req s20
+POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
+POST1 .req s25
+POST2 .req s26
+POST3 .req s27
+
+
+.macro inner_loop decifactor, dir, tail, head
+ .ifc "\dir","up"
+ .set X, 0
+ .set Y, 4
+ .else
+ .set X, 4*JMAX*4 - 4
+ .set Y, -4
+ .endif
+ .ifnc "\head",""
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+ vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
+ .endif
+ .ifnc "\head",""
+ vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+ vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+ .ifc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+ .ifnc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+ vstmia POUT!, {POST0-POST3}
+ .endif
+ .ifnc "\head",""
+ vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+ vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
+ .if \decifactor == 32
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+ vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+ vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+ vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+ vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
+ .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir decifactor
+ .if \decifactor == 32
+ .set JMAX, 8
+ vpush {s16-s31}
+ vmov SCALE32, s0 @ duplicate scalar across vector
+ vldr IN4, [PIN, #-4*4]
+ vldr IN5, [PIN, #-5*4]
+ vldr IN6, [PIN, #-6*4]
+ vldr IN7, [PIN, #-7*4]
+ .else
+ .set JMAX, 4
+ vpush {s16-s27}
+ .endif
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, up,, head
+1: add PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, up, tail, head
+ bne 1b
+ inner_loop \decifactor, up, tail
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, down,, head
+1: sub PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, down, tail, head
+ bne 1b
+ inner_loop \decifactor, down, tail
+
+ .if \decifactor == 32
+ vpop {s16-s31}
+ .else
+ vpop {s16-s27}
+ .endif
+ fmxr FPSCR, OLDFPSCR
+ bx lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ * int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+ teq DECIFACTOR, #32
+ fmrx OLDFPSCR, FPSCR
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, ip
+NOVFP vldr s0, [sp]
+ vldr IN0, [PIN, #-0*4]
+ vldr IN1, [PIN, #-1*4]
+ vldr IN2, [PIN, #-2*4]
+ vldr IN3, [PIN, #-3*4]
+ beq 32f
+64: dca_lfe_fir 64
+ .ltorg
+32: dca_lfe_fir 32
+endfunc
+
+ .unreq POUT
+ .unreq PIN
+ .unreq PCOEF
+ .unreq DECIFACTOR
+ .unreq OLDFPSCR
+ .unreq COUNTER
+
+ .unreq SCALE32
+ .unreq SCALE64
+ .unreq IN0
+ .unreq IN1
+ .unreq IN2
+ .unreq IN3
+ .unreq IN4
+ .unreq IN5
+ .unreq IN6
+ .unreq IN7
+ .unreq COEF0
+ .unreq COEF1
+ .unreq COEF2
+ .unreq COEF3
+ .unreq COEF4
+ .unreq COEF5
+ .unreq COEF6
+ .unreq COEF7
+ .unreq ACCUM0
+ .unreq ACCUM4
+ .unreq POST0
+ .unreq POST1
+ .unreq POST2
+ .unreq POST3