-rw-r--r--  libavcodec/arm/h264cmc_neon.S  86
1 file changed, 28 insertions, 58 deletions
diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
index c7e54605bb..3427e36011 100644
--- a/libavcodec/arm/h264cmc_neon.S
+++ b/libavcodec/arm/h264cmc_neon.S
@@ -51,24 +51,20 @@ T cmp r7, #0
beq 2f
- add r5, r1, r2
-
vdup.8 d0, r4
- lsl r4, r2, #1
vdup.8 d1, r12
- vld1.8 {d4, d5}, [r1], r4
+ vld1.8 {d4, d5}, [r1], r2
vdup.8 d2, r6
- vld1.8 {d6, d7}, [r5], r4
vdup.8 d3, r7
-
vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
-1: pld [r5]
+1: vld1.8 {d6, d7}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
- vld1.8 {d4, d5}, [r1], r4
+ vext.8 d7, d6, d7, #1
+ vld1.8 {d4, d5}, [r1], r2
vmlal.u8 q8, d6, d2
+ pld [r1]
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
vmull.u8 q9, d6, d0
@@ -76,8 +72,7 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
- vld1.8 {d6, d7}, [r5], r4
- pld [r1]
+ pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -92,7 +87,6 @@ T cmp r7, #0
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
- vext.8 d7, d6, d7, #1
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 1b
@@ -106,18 +100,15 @@ T cmp r7, #0
beq 4f
- add r5, r1, r2
- lsl r4, r2, #1
- vld1.8 {d4}, [r1], r4
- vld1.8 {d6}, [r5], r4
+ vld1.8 {d4}, [r1], r2
-3: pld [r5]
+3: vld1.8 {d6}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
- vld1.8 {d6}, [r5], r4
+ pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -127,13 +118,13 @@ T cmp r7, #0
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
+ pld [r1, r2]
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
subs r3, r3, #2
- pld [r1]
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 3b
@@ -144,16 +135,13 @@ T cmp r7, #0
vld1.8 {d6, d7}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
-
-5: pld [r1]
+ pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
- vld1.8 {d4, d5}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d7, d1
- pld [r1]
- vext.8 d5, d4, d5, #1
+ pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
@@ -168,11 +156,9 @@ T cmp r7, #0
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
- vld1.8 {d6, d7}, [r1], r2
- vext.8 d7, d6, d7, #1
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
- bgt 5b
+ bgt 4b
pop {r4-r7, pc}
endfunc
@@ -209,33 +195,29 @@ T cmp r7, #0
beq 2f
- add r5, r1, r2
-
vdup.8 d0, r4
- lsl r4, r2, #1
vdup.8 d1, r12
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vdup.8 d2, r6
- vld1.8 {d6}, [r5], r4
vdup.8 d3, r7
vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
- vtrn.32 d6, d7
vtrn.32 d0, d1
vtrn.32 d2, d3
-1: pld [r5]
+1: vld1.8 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d2
- vld1.8 {d4}, [r1], r4
+ vld1.8 {d4}, [r1], r2
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
+ pld [r1]
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d2
- vld1.8 {d6}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
.ifc \codec,h264
@@ -245,14 +227,12 @@ T cmp r7, #0
vshrn.u16 d16, q8, #6
.endif
subs r3, r3, #2
- pld [r1]
+ pld [r1, r2]
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
- vext.8 d7, d6, d7, #1
- vtrn.32 d6, d7
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 1b
@@ -268,18 +248,15 @@ T cmp r7, #0
beq 4f
vext.32 d1, d0, d1, #1
- add r5, r1, r2
- lsl r4, r2, #1
- vld1.32 {d4[0]}, [r1], r4
- vld1.32 {d4[1]}, [r5], r4
+ vld1.32 {d4[0]}, [r1], r2
-3: pld [r5]
+3: vld1.32 {d4[1]}, [r1], r2
vmull.u8 q8, d4, d0
- vld1.32 {d4[0]}, [r1], r4
+ vld1.32 {d4[0]}, [r1], r2
vmull.u8 q9, d4, d1
- vld1.32 {d4[1]}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
+ pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
@@ -292,7 +269,7 @@ T cmp r7, #0
vrhadd.u8 d16, d16, d20
.endif
subs r3, r3, #2
- pld [r1]
+ pld [r1, r2]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 3b
@@ -305,13 +282,9 @@ T cmp r7, #0
vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
vtrn.32 d6, d7
-
-5: vmull.u8 q8, d4, d0
+ vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
subs r3, r3, #2
- vld1.8 {d4}, [r1], r2
- vext.8 d5, d4, d5, #1
- vtrn.32 d4, d5
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
@@ -326,13 +299,10 @@ T cmp r7, #0
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
- vld1.8 {d6}, [r1], r2
- vext.8 d7, d6, d7, #1
- vtrn.32 d6, d7
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
- bgt 5b
+ bgt 4b
pop {r4-r7, pc}
endfunc
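
For context, each loop above gets the same addressing simplification: the old code walked the source with two pointers, r1 and r5 = r1 + stride, each advanced by 2*stride (the removed "add r5, r1, r2" / "lsl r4, r2, #1" setup), while the new code keeps a single pointer r1 advanced by the stride r2 for every row, folding the second-row load and the pld prefetches into the loop body. Below is a minimal C sketch of that pointer pattern only; it is not FFmpeg code, and process_pair() is a hypothetical stand-in for the NEON filtering.

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical stand-in for the per-pair NEON filtering; it only copies
     * two source rows to the destination so the addressing stays the focus. */
    static void process_pair(uint8_t *dst, const uint8_t *row0,
                             const uint8_t *row1, int w)
    {
        for (int x = 0; x < w; x++) {
            dst[x]     = row0[x];
            dst[w + x] = row1[x];
        }
    }

    /* Old scheme: two source pointers, each stepping 2*stride per iteration,
     * mirroring the removed r5/r4 setup. */
    void mc_two_pointers(uint8_t *dst, const uint8_t *src,
                         ptrdiff_t stride, int w, int h)
    {
        const uint8_t *p0 = src;
        const uint8_t *p1 = src + stride;
        for (int y = 0; y < h; y += 2) {
            process_pair(dst, p0, p1, w);
            p0  += 2 * stride;
            p1  += 2 * stride;
            dst += 2 * w;
        }
    }

    /* New scheme: one source pointer stepping a single stride per row, as in
     * the rewritten loops that load every row through r1. */
    void mc_one_pointer(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int w, int h)
    {
        const uint8_t *p = src;
        for (int y = 0; y < h; y += 2) {
            const uint8_t *row0 = p;
            p += stride;
            const uint8_t *row1 = p;
            p += stride;
            process_pair(dst, row0, row1, w);
            dst += 2 * w;
        }
    }

Dropping the second pointer frees r5 and the doubled stride in r4, which is presumably why the per-branch scratch-register setup disappears in the diff while the loads and prefetches move inside the loops.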