summary | refs | log | tree | commit | diff
path: root/libavcodec/arm/vp8dsp_neon.S
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2019-02-01 11:05:22 +0200
committer: Martin Storsjö <martin@martin.st> 2019-02-19 11:46:18 +0200
commit: cef914e08310166112ac09567e66452a7679bfc8 (patch)
tree: 5e4481bc3c700c05aa8706271672cd34d7abadc1 /libavcodec/arm/vp8dsp_neon.S
parent: e39a9212ab37a55b346801c77487d8a47b6f9fe2 (diff)
arm: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2
This makes it similar to put_epel16_v6, and gives a 10-25% speedup of this function.

Before:                    Cortex A7      A8      A9     A53     A72
vp8_put_epel16_h6v6_neon:     3058.0  2218.5  2459.8  2183.0  1572.2
After:
vp8_put_epel16_h6v6_neon:     2670.8  1934.2  2244.4  1729.4  1503.9

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/vp8dsp_neon.S')
-rw-r--r-- libavcodec/arm/vp8dsp_neon.S 41
1 files changed, 13 insertions, 28 deletions
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index f43b4f7060..b707d19fed 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -773,23 +773,6 @@ endfunc
vqrshrun.s16 \d1, q14, #7
.endm
-.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
- vmovl.u8 q10, \s2
- vmovl.u8 q11, \s3
- vmovl.u8 q9, \s1
- vmovl.u8 q12, \s4
- vmovl.u8 q8, \s0
- vmovl.u8 q13, \s5
- vmul.u16 q10, q10, d0[2]
- vmul.u16 q11, q11, d0[3]
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q11, q12, d1[0]
- vmla.u16 q10, q8, d0[0]
- vmla.u16 q11, q13, d1[1]
- vqadd.s16 q11, q10, q11
- vqrshrun.s16 \d0, q11, #7
-.endm
-
.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
vmovl.u8 q10, \s0
vmovl.u8 q11, \s3
@@ -909,12 +892,12 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
sub r2, r2, r3, lsl #1
sub r2, r2, #2
push {r4,lr}
- vpush {d8-d9}
+ vpush {d8-d15}
@ first pass (horizontal):
- ldr r4, [sp, #28] @ mx
+ ldr r4, [sp, #64+8+4] @ mx
movrel lr, subpel_filters-16
- ldr r12, [sp, #24] @ h
+ ldr r12, [sp, #64+8+0] @ h
add r4, lr, r4, lsl #4
sub sp, sp, #336+16
vld1.16 {q0}, [r4,:128]
@@ -931,9 +914,9 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
bne 1b
@ second pass (vertical):
- ldr r4, [sp, #336+16+32] @ my
+ ldr r4, [sp, #336+16+64+8+8] @ my
movrel lr, subpel_filters-16
- ldr r12, [sp, #336+16+24] @ h
+ ldr r12, [sp, #336+16+64+8+0] @ h
add r4, lr, r4, lsl #4
add lr, sp, #15
vld1.16 {q0}, [r4,:128]
@@ -941,18 +924,20 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
2:
vld1.8 {d2-d5}, [lr,:128]!
vld1.8 {d6-d9}, [lr,:128]!
- vld1.8 {d28-d31},[lr,:128]
- sub lr, lr, #48
+ vld1.8 {d10-d13},[lr,:128]!
+ vld1.8 {d14-d15},[lr,:128]
+ sub lr, lr, #64
- vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
- vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
+ vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
+ vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
vst1.8 {d2-d3}, [r0,:128], r1
- subs r12, r12, #1
+ vst1.8 {d4-d5}, [r0,:128], r1
+ subs r12, r12, #2
bne 2b
add sp, sp, #336+16
- vpop {d8-d9}
+ vpop {d8-d15}
pop {r4,pc}
endfunc