From 62844c3fd66940c7747e9b2bb7804e265319f43f Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Mon, 18 Feb 2013 21:03:02 -0800
Subject: h264: Integrate clear_blocks calls with IDCT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700 to
672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
(in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
tested (cathedral), i.e. almost 30 cycles per mb faster.

Signed-off-by: Martin Storsjö
---
 libavcodec/arm/h264idct_neon.S | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'libavcodec/arm/h264idct_neon.S')

diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index b23ddb1b73..3e5321cd05 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -22,9 +22,12 @@
 function ff_h264_idct_add_neon, export=1
         vld1.64 {d0-d3}, [r1,:128]
+        vmov.i16 q15, #0
         vswp d1, d2
+        vst1.16 {q15}, [r1,:128]!
         vadd.i16 d4, d0, d1
+        vst1.16 {q15}, [r1,:128]!
         vshr.s16 q8, q1, #1
         vsub.i16 d5, d0, d1
         vadd.i16 d6, d2, d17
@@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1
         vst1.32 {d0[1]}, [r0,:32], r2
         vst1.32 {d1[0]}, [r0,:32], r2
+        sub r1, r1, #32
         bx lr
 endfunc
 function ff_h264_idct_dc_add_neon, export=1
+        mov r3, #0
         vld1.16 {d2[],d3[]}, [r1,:16]
+        strh r3, [r1]
         vrshr.s16 q1, q1, #6
         vld1.32 {d0[0]}, [r0,:32], r2
         vld1.32 {d0[1]}, [r0,:32], r2
@@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1
         add r5, r1, #16*4
         add r1, r2, #16*32
         mov r2, r3
-        mov r3, r1
+        mov r10, r1
         ldr r6, [sp, #32]
         movrel r7, scan8+16
         mov r12, #0
@@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1
         ldr r0, [r5, r12, lsl #2]
         ldrb r8, [r6, r8]
         add r0, r0, r4
-        add r1, r3, r12, lsl #5
+        add r1, r10, r12, lsl #5
         cmp r8, #0
         ldrsh r8, [r1]
         iteet ne
@@ -180,7 +186,9 @@ endfunc
         qb .req q14
         vshr.s16 q2, q10, #1
         vadd.i16 q0, q8, q12
-        vld1.16 {q14-q15},[r1,:128]!
+        vld1.16 {q14-q15},[r1,:128]
+        vst1.16 {q3}, [r1,:128]!
+        vst1.16 {q3}, [r1,:128]!
         vsub.i16 q1, q8, q12
         vshr.s16 q3, q14, #1
         vsub.i16 q2, q2, q14
@@ -259,9 +267,16 @@ endfunc
 .endm
 function ff_h264_idct8_add_neon, export=1
-        vld1.16 {q8-q9}, [r1,:128]!
-        vld1.16 {q10-q11},[r1,:128]!
-        vld1.16 {q12-q13},[r1,:128]!
+        vmov.i16 q3, #0
+        vld1.16 {q8-q9}, [r1,:128]
+        vst1.16 {q3}, [r1,:128]!
+        vst1.16 {q3}, [r1,:128]!
+        vld1.16 {q10-q11},[r1,:128]
+        vst1.16 {q3}, [r1,:128]!
+        vst1.16 {q3}, [r1,:128]!
+        vld1.16 {q12-q13},[r1,:128]
+        vst1.16 {q3}, [r1,:128]!
+        vst1.16 {q3}, [r1,:128]!
         idct8x8_cols 0
         idct8x8_cols 1
@@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1
 endfunc
 function ff_h264_idct8_dc_add_neon, export=1
+        mov r3, #0
         vld1.16 {d30[],d31[]},[r1,:16]
+        strh r3, [r1]
         vld1.32 {d0}, [r0,:64], r2
         vrshr.s16 q15, q15, #6
         vld1.32 {d1}, [r0,:64], r2
--
cgit v1.2.3