author    Janne Grunau <janne-libav@jannau.net>          2017-01-10 00:15:07 +0200
committer Michael Niedermayer <michael@niedermayer.cc>   2017-01-14 21:13:06 +0100
commit    62ea07d797c503bc4b727e56d9c0f914a93c8ef6 (patch)
tree      cc117829b61ee24d877615cb41df2f7f72ccf856 /libavcodec/aarch64/vp9lpf_neon.S
parent    3ac46a0a62386a52e38c066379ff36b5038dd4d0 (diff)
aarch64: vp9: use alternative returns in the core loop filter function
Since aarch64 has enough free general purpose registers, use them to branch to the appropriate storage code. 1-2 cycles faster for the functions using loop_filter 8/16, ... on a cortex-a53. Mixed results (up to 2 cycles faster/slower) on a cortex-a57.

This is cherry-picked from libav commit d7595de0b25e7064fd9e06dea5d0425536cef6dc.

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
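In outline, the patch moves the post-call dispatch out of the caller: instead of the callee returning a code in x5 that the caller then tests with cbnz/cmp, the caller loads the address of its alternative writeout label into a free register (x13, or x14/x15 for the 16-pixel variants) before the bl, and the callee branches there directly with br. A condensed sketch of the two patterns, with register choices and labels taken from the patch but surrounding code omitted for illustration:

    // Before: callee signals the exit path via x5, caller re-dispatches.
    .macro loop_filter_8
            bl      vp9_loop_filter_8
            cbnz    x5, 6f          // extra compare-and-branch after every return
    .endm

    // After: caller computes the alternative 'return' target up front...
    .macro loop_filter_8
            adr     x13, 6f         // address of this caller's writeout code
            bl      vp9_loop_filter_8
    .endm

    // ...and the callee jumps straight to it, skipping the mov/ret/cbnz round trip.
    6:
            br      x13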
Diffstat (limited to 'libavcodec/aarch64/vp9lpf_neon.S')
-rw-r--r-- libavcodec/aarch64/vp9lpf_neon.S | 48
1 file changed, 18 insertions, 30 deletions
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e727a4d0de..78aae61e87 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
.endif
// If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels
- cbz x5, 7f
+ cbnz x5, 1f
+ br x14
+1:
mov x5, v7.d[0]
.ifc \sz, .16b
mov x6, v7.d[1]
orr x5, x5, x6
.endif
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
- cbz x5, 8f
+ cbnz x5, 1f
+ br x15
+1:
// flat8out
// This writes all outputs into v2-v17 (skipping v6 and v16).
// If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc
function vp9_loop_filter_8
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
- mov x5, #0
ret
6:
- mov x5, #6
- ret
+ br x13
9:
br x10
endfunc
function vp9_loop_filter_8_16b_mix
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
- mov x5, #0
ret
6:
- mov x5, #6
- ret
+ br x13
9:
br x10
endfunc
function vp9_loop_filter_16
loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
- mov x5, #0
- ret
-7:
- mov x5, #7
- ret
-8:
- mov x5, #8
ret
9:
ldp d8, d9, [sp], 0x10
@@ -589,13 +582,6 @@ endfunc
function vp9_loop_filter_16_16b
loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
- mov x5, #0
- ret
-7:
- mov x5, #7
- ret
-8:
- mov x5, #8
ret
9:
ldp d8, d9, [sp], 0x10
@@ -614,11 +600,14 @@ endfunc
.endm
.macro loop_filter_8
+ // calculate alternative 'return' targets
+ adr x13, 6f
bl vp9_loop_filter_8
- cbnz x5, 6f
.endm
.macro loop_filter_8_16b_mix mix
+ // calculate alternative 'return' targets
+ adr x13, 6f
.if \mix == 48
mov x11, #0xffffffff00000000
.elseif \mix == 84
@@ -627,21 +616,20 @@ endfunc
mov x11, #0xffffffffffffffff
.endif
bl vp9_loop_filter_8_16b_mix
- cbnz x5, 6f
.endm
.macro loop_filter_16
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
bl vp9_loop_filter_16
- cmp x5, 7
- b.gt 8f
- b.eq 7f
.endm
.macro loop_filter_16_16b
+ // calculate alternative 'return' targets
+ adr x14, 7f
+ adr x15, 8f
bl vp9_loop_filter_16_16b
- cmp x5, 7
- b.gt 8f
- b.eq 7f
.endm