summary | refs | log | tree | commit | diff
path: root/libavcodec/arm/h264idct_neon.S
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2013-02-18 21:03:02 -0800
committer: Martin Storsjö <martin@martin.st> 2013-04-10 11:03:06 +0300
commit: 62844c3fd66940c7747e9b2bb7804e265319f43f (patch)
tree: b0e5a05644457aa5d7598d1fefa1a41f83550753 /libavcodec/arm/h264idct_neon.S
parent: e8cafd2773bc56455c8816593cbd9368f2d69a80 (diff)
h264: Integrate clear_blocks calls with IDCT
The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700 to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb (in the decode_slice loop) goes from 1759 to 1733 cycles on the clip tested (cathedral), i.e. almost 30 cycles faster per macroblock (mb).

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/arm/h264idct_neon.S')
-rw-r--r-- libavcodec/arm/h264idct_neon.S 29
1 files changed, 23 insertions, 6 deletions
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index b23ddb1b73..3e5321cd05 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -22,9 +22,12 @@
function ff_h264_idct_add_neon, export=1
vld1.64 {d0-d3}, [r1,:128]
+ vmov.i16 q15, #0
vswp d1, d2
+ vst1.16 {q15}, [r1,:128]!
vadd.i16 d4, d0, d1
+ vst1.16 {q15}, [r1,:128]!
vshr.s16 q8, q1, #1
vsub.i16 d5, d0, d1
vadd.i16 d6, d2, d17
@@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2
+ sub r1, r1, #32
bx lr
endfunc
function ff_h264_idct_dc_add_neon, export=1
+ mov r3, #0
vld1.16 {d2[],d3[]}, [r1,:16]
+ strh r3, [r1]
vrshr.s16 q1, q1, #6
vld1.32 {d0[0]}, [r0,:32], r2
vld1.32 {d0[1]}, [r0,:32], r2
@@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1
add r5, r1, #16*4
add r1, r2, #16*32
mov r2, r3
- mov r3, r1
+ mov r10, r1
ldr r6, [sp, #32]
movrel r7, scan8+16
mov r12, #0
@@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1
ldr r0, [r5, r12, lsl #2]
ldrb r8, [r6, r8]
add r0, r0, r4
- add r1, r3, r12, lsl #5
+ add r1, r10, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
iteet ne
@@ -180,7 +186,9 @@ endfunc
qb .req q14
vshr.s16 q2, q10, #1
vadd.i16 q0, q8, q12
- vld1.16 {q14-q15},[r1,:128]!
+ vld1.16 {q14-q15},[r1,:128]
+ vst1.16 {q3}, [r1,:128]!
+ vst1.16 {q3}, [r1,:128]!
vsub.i16 q1, q8, q12
vshr.s16 q3, q14, #1
vsub.i16 q2, q2, q14
@@ -259,9 +267,16 @@ endfunc
.endm
function ff_h264_idct8_add_neon, export=1
- vld1.16 {q8-q9}, [r1,:128]!
- vld1.16 {q10-q11},[r1,:128]!
- vld1.16 {q12-q13},[r1,:128]!
+ vmov.i16 q3, #0
+ vld1.16 {q8-q9}, [r1,:128]
+ vst1.16 {q3}, [r1,:128]!
+ vst1.16 {q3}, [r1,:128]!
+ vld1.16 {q10-q11},[r1,:128]
+ vst1.16 {q3}, [r1,:128]!
+ vst1.16 {q3}, [r1,:128]!
+ vld1.16 {q12-q13},[r1,:128]
+ vst1.16 {q3}, [r1,:128]!
+ vst1.16 {q3}, [r1,:128]!
idct8x8_cols 0
idct8x8_cols 1
@@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1
endfunc
function ff_h264_idct8_dc_add_neon, export=1
+ mov r3, #0
vld1.16 {d30[],d31[]},[r1,:16]
+ strh r3, [r1]
vld1.32 {d0}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.32 {d1}, [r0,:64], r2