From 8986fddc2bab92bd7d77a123ac70c4fb70c96c7c Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Tue, 14 Jun 2011 11:29:48 +0100 Subject: ARM: allow building in Thumb2 mode Signed-off-by: Mans Rullgard --- libavcodec/arm/aac.h | 3 + libavcodec/arm/ac3dsp_arm.S | 1 + libavcodec/arm/ac3dsp_armv6.S | 2 + libavcodec/arm/ac3dsp_neon.S | 1 + libavcodec/arm/asm.S | 93 +++++++++++++++++++++++++++++ libavcodec/arm/dcadsp_neon.S | 1 + libavcodec/arm/dsputil_arm.S | 10 ++++ libavcodec/arm/dsputil_armv6.S | 76 ++++++++++++------------ libavcodec/arm/dsputil_neon.S | 3 + libavcodec/arm/dsputil_vfp.S | 21 +++++++ libavcodec/arm/fmtconvert_neon.S | 4 ++ libavcodec/arm/fmtconvert_vfp.S | 3 + libavcodec/arm/h264dsp_neon.S | 98 +++++++++++++++++-------------- libavcodec/arm/h264idct_neon.S | 23 +++++--- libavcodec/arm/mathops.h | 3 + libavcodec/arm/mdct_neon.S | 4 +- libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 6 +- libavcodec/arm/mpegvideo_armv5te_s.S | 12 ++++ libavcodec/arm/mpegvideo_neon.S | 4 ++ libavcodec/arm/rdft_neon.S | 1 + libavcodec/arm/simple_idct_arm.S | 32 ++++++++-- libavcodec/arm/simple_idct_armv5te.S | 39 ++++++++---- libavcodec/arm/simple_idct_armv6.S | 33 ++++++----- libavcodec/arm/simple_idct_neon.S | 6 +- libavcodec/arm/synth_filter_neon.S | 2 + libavcodec/arm/vp56_arith.h | 27 +++++++-- libavcodec/arm/vp8_armv6.S | 36 +++++++++++- libavcodec/arm/vp8dsp_neon.S | 16 ++--- 28 files changed, 415 insertions(+), 145 deletions(-) (limited to 'libavcodec/arm') diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h index 6d5df4933a..83b5aef1b6 100644 --- a/libavcodec/arm/aac.h +++ b/libavcodec/arm/aac.h @@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx, "vmov d1, %2, %3 \n\t" "lsls %6, %6, #1 \n\t" "and %0, %5, #1<<31 \n\t" + "it cs \n\t" "lslcs %5, %5, #1 \n\t" "lsls %6, %6, #1 \n\t" "and %1, %5, #1<<31 \n\t" + "it cs \n\t" "lslcs %5, %5, #1 \n\t" "lsls %6, %6, #1 \n\t" "and %2, %5, #1<<31 \n\t" + "it cs \n\t" "lslcs %5, %5, #1 \n\t" "vmov d4, %0, %1 \n\t" "and %3, %5, #1<<31 \n\t" diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S index 545714cff1..9a7d20eb7b 100644 --- a/libavcodec/arm/ac3dsp_arm.S +++ b/libavcodec/arm/ac3dsp_arm.S @@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1 lsl r3, lr, #1 ldrh r12, [r0, r3] subs r2, r2, #1 + it gt ldrbgt lr, [r1], #1 add r12, r12, #1 strh r12, [r0, r3] diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S index 8026cb72e6..d3058ffcec 100644 --- a/libavcodec/arm/ac3dsp_armv6.S +++ b/libavcodec/arm/ac3dsp_armv6.S @@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 mov r11, r10 ldrb r10, [r4], #1 @ band_start_tab[band++] subs r9, r9, r5 @ - floor + it lt movlt r9, #0 cmp r10, r3 @ - end and r9, r9, r8 @ & 0x1fe0 + ite gt subgt r8, r3, r11 suble r8, r10, r11 add r9, r9, r5 @ + floor => m diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S index 946b39f25b..fdf1deabc9 100644 --- a/libavcodec/arm/ac3dsp_neon.S +++ b/libavcodec/arm/ac3dsp_neon.S @@ -41,6 +41,7 @@ endfunc function ff_ac3_exponent_min_neon, export=1 cmp r1, #0 + it eq bxeq lr push {lr} mov r12, #256 diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S index 8d7fe98614..a7d3ace208 100644 --- a/libavcodec/arm/asm.S +++ b/libavcodec/arm/asm.S @@ -24,9 +24,18 @@ # define ELF #else # define ELF @ +#endif + +#if CONFIG_THUMB +# define A @ +# define T +#else +# define A +# define T @ #endif .syntax unified +T .thumb .macro require8 val=1 ELF .eabi_attribute 24, \val @@ -82,6 +91,90 @@ ELF .size \name, . - \name #endif .endm +.macro ldr_pre rt, rn, rm:vararg +A ldr \rt, [\rn, \rm]! +T add \rn, \rn, \rm +T ldr \rt, [\rn] +.endm + +.macro ldr_post rt, rn, rm:vararg +A ldr \rt, [\rn], \rm +T ldr \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro ldrd_reg rt, rt2, rn, rm +A ldrd \rt, \rt2, [\rn, \rm] +T add \rt, \rn, \rm +T ldrd \rt, \rt2, [\rt] +.endm + +.macro ldrd_post rt, rt2, rn, rm +A ldrd \rt, \rt2, [\rn], \rm +T ldrd \rt, \rt2, [\rn] +T add \rn, \rn, \rm +.endm + +.macro ldrh_pre rt, rn, rm +A ldrh \rt, [\rn, \rm]! +T add \rn, \rn, \rm +T ldrh \rt, [\rn] +.endm + +.macro ldrh_dpre rt, rn, rm +A ldrh \rt, [\rn, -\rm]! +T sub \rn, \rn, \rm +T ldrh \rt, [\rn] +.endm + +.macro ldrh_post rt, rn, rm +A ldrh \rt, [\rn], \rm +T ldrh \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro str_post rt, rn, rm:vararg +A str \rt, [\rn], \rm +T str \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strb_post rt, rn, rm:vararg +A strb \rt, [\rn], \rm +T strb \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strd_post rt, rt2, rn, rm +A strd \rt, \rt2, [\rn], \rm +T strd \rt, \rt2, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strh_pre rt, rn, rm +A strh \rt, [\rn, \rm]! +T add \rn, \rn, \rm +T strh \rt, [\rn] +.endm + +.macro strh_dpre rt, rn, rm +A strh \rt, [\rn, -\rm]! +T sub \rn, \rn, \rm +T strh \rt, [\rn] +.endm + +.macro strh_post rt, rn, rm +A strh \rt, [\rn], \rm +T strh \rt, [\rn] +T add \rn, \rn, \rm +.endm + +.macro strh_dpost rt, rn, rm +A strh \rt, [\rn], -\rm +T strh \rt, [\rn] +T sub \rn, \rn, \rm +.endm + #if HAVE_VFP_ARGS .eabi_attribute 28, 1 # define VFP diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S index 61208361df..71f5dd843b 100644 --- a/libavcodec/arm/dcadsp_neon.S +++ b/libavcodec/arm/dcadsp_neon.S @@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1 add r5, r2, #256*4-16 @ cf1 sub r1, r1, #12 cmp r3, #32 + ite eq moveq r6, #256/32 movne r6, #256/64 NOVFP vldr s0, [sp, #16] @ scale diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S index 21176283c4..eb20ad6d9e 100644 --- a/libavcodec/arm/dsputil_arm.S +++ b/libavcodec/arm/dsputil_arm.S @@ -554,10 +554,12 @@ endfunc and r9, r5, r14 and r10, r6, r14 and r11, r7, r14 + it eq andeq r14, r14, r14, \rnd #1 add r8, r8, r10 add r9, r9, r11 ldr r12, =0xfcfcfcfc >> 2 + itt eq addeq r8, r8, r14 addeq r9, r9, r14 and r4, r12, r4, lsr #2 @@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1 mvn r5, r5 mvn r7, r7 tst r6, #0x100 + it ne movne r6, r5, lsr #24 tst r8, #0x100 + it ne movne r8, r7, lsr #24 mov r9, r6 ldrsh r5, [r0, #4] /* moved form [A] */ @@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1 mvn r5, r5 mvn r7, r7 tst r6, #0x100 + it ne movne r6, r5, lsr #24 tst r8, #0x100 + it ne movne r8, r7, lsr #24 orr r9, r9, r6, lsl #16 ldr r4, [r1, #4] /* moved form [B] */ @@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1 mvn r5, r5 mvn r7, r7 tst r6, #0x100 + it ne movne r6, r5, lsr #24 tst r8, #0x100 + it ne movne r8, r7, lsr #24 mov r9, r6 ldrsh r5, [r0, #12] /* moved from [D] */ @@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1 mvn r5, r5 mvn r7, r7 tst r6, #0x100 + it ne movne r6, r5, lsr #24 tst r8, #0x100 + it ne movne r8, r7, lsr #24 orr r9, r9, r6, lsl #16 add r0, r0, #16 /* moved from [E] */ diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S index 8acb96de2f..b8461059d9 100644 --- a/libavcodec/arm/dsputil_armv6.S +++ b/libavcodec/arm/dsputil_armv6.S @@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1 ldr r5, [r1, #4] ldr r6, [r1, #8] ldr r7, [r1, #12] - ldr r4, [r1], r2 + ldr_post r4, r1, r2 strd r6, r7, [r0, #8] ldr r9, [r1, #4] - strd r4, r5, [r0], r2 + strd_post r4, r5, r0, r2 ldr r10, [r1, #8] ldr r11, [r1, #12] - ldr r8, [r1], r2 + ldr_post r8, r1, r2 strd r10, r11, [r0, #8] subs r3, r3, #2 - strd r8, r9, [r0], r2 + strd_post r8, r9, r0, r2 bne 1b pop {r4-r11} @@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1 push {r4-r7} 1: ldr r5, [r1, #4] - ldr r4, [r1], r2 + ldr_post r4, r1, r2 ldr r7, [r1, #4] - strd r4, r5, [r0], r2 - ldr r6, [r1], r2 + strd_post r4, r5, r0, r2 + ldr_post r6, r1, r2 subs r3, r3, #2 - strd r6, r7, [r0], r2 + strd_post r6, r7, r0, r2 bne 1b pop {r4-r7} @@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1 ldr r5, [r1, #4] ldr r7, [r1, #5] lsr r6, r4, #8 - ldr r8, [r1, r2]! + ldr_pre r8, r1, r2 orr r6, r6, r5, lsl #24 ldr r9, [r1, #4] ldr r11, [r1, #5] @@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1 uhadd8 r9, r9, r11 and r6, r6, r12 uadd8 r8, r8, r14 - strd r4, r5, [r0], r2 + strd_post r4, r5, r0, r2 uadd8 r9, r9, r6 - strd r8, r9, [r0], r2 + strd_post r8, r9, r0, r2 bne 1b pop {r4-r11, pc} @@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1 orr r12, r12, r12, lsl #16 ldr r4, [r1] ldr r5, [r1, #4] - ldr r6, [r1, r2]! + ldr_pre r6, r1, r2 ldr r7, [r1, #4] 1: subs r3, r3, #2 @@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1 uhadd8 r9, r5, r7 eor r11, r5, r7 and r10, r10, r12 - ldr r4, [r1, r2]! + ldr_pre r4, r1, r2 uadd8 r8, r8, r10 and r11, r11, r12 uadd8 r9, r9, r11 @@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1 eor r7, r5, r7 uadd8 r10, r10, r6 and r7, r7, r12 - ldr r6, [r1, r2]! + ldr_pre r6, r1, r2 uadd8 r11, r11, r7 - strd r8, r9, [r0], r2 + strd_post r8, r9, r0, r2 ldr r7, [r1, #4] - strd r10, r11, [r0], r2 + strd_post r10, r11, r0, r2 bne 1b pop {r4-r11} @@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1 ldr r4, [r1] ldr r5, [r1, #4] ldr r7, [r1, #5] - ldr r8, [r1, r2]! + ldr_pre r8, r1, r2 ldr r9, [r1, #4] ldr r14, [r1, #5] add r1, r1, r2 @@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1 push {r4-r9, lr} ldr r4, [r1] ldr r5, [r1, #4] - ldr r6, [r1, r2]! + ldr_pre r6, r1, r2 ldr r7, [r1, #4] 1: subs r3, r3, #2 uhadd8 r8, r4, r6 - ldr r4, [r1, r2]! + ldr_pre r4, r1, r2 uhadd8 r9, r5, r7 ldr r5, [r1, #4] uhadd8 r12, r4, r6 - ldr r6, [r1, r2]! + ldr_pre r6, r1, r2 uhadd8 r14, r5, r7 ldr r7, [r1, #4] stm r0, {r8,r9} @@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1 orr lr, lr, lr, lsl #16 ldrd r4, r5, [r0] ldr r10, [r1, #4] - ldr r9, [r1], r2 + ldr_post r9, r1, r2 subs r3, r3, #2 1: pld [r1, r2] eor r8, r4, r9 uhadd8 r4, r4, r9 eor r12, r5, r10 - ldrd r6, r7, [r0, r2] + ldrd_reg r6, r7, r0, r2 uhadd8 r5, r5, r10 and r8, r8, lr ldr r10, [r1, #4] and r12, r12, lr uadd8 r4, r4, r8 - ldr r9, [r1], r2 + ldr_post r9, r1, r2 eor r8, r6, r9 uadd8 r5, r5, r12 pld [r1, r2, lsl #1] eor r12, r7, r10 uhadd8 r6, r6, r9 - strd r4, r5, [r0], r2 + strd_post r4, r5, r0, r2 uhadd8 r7, r7, r10 beq 2f and r8, r8, lr - ldrd r4, r5, [r0, r2] + ldrd_reg r4, r5, r0, r2 uadd8 r6, r6, r8 ldr r10, [r1, #4] and r12, r12, lr subs r3, r3, #2 uadd8 r7, r7, r12 - ldr r9, [r1], r2 - strd r6, r7, [r0], r2 + ldr_post r9, r1, r2 + strd_post r6, r7, r0, r2 b 1b 2: and r8, r8, lr and r12, r12, lr uadd8 r6, r6, r8 uadd8 r7, r7, r12 - strd r6, r7, [r0], r2 + strd_post r6, r7, r0, r2 pop {r4-r10, pc} endfunc @@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1 orr r6, r8, r5, lsl #8 orr r7, r4, lr, lsl #8 subs r3, r3, #1 - strd r6, r7, [r1], r2 + strd_post r6, r7, r1, r2 bgt 1b pop {r4-r8,pc} endfunc @@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1 push {r4-r8, lr} mov lr, #8 1: - ldrd r4, r5, [r1], r2 + ldrd_post r4, r5, r1, r2 subs lr, lr, #1 uxtb16 r6, r4 uxtb16 r4, r4, ror #8 @@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1 push {r4-r9, lr} mov lr, #8 1: - ldrd r4, r5, [r1], r3 - ldrd r6, r7, [r2], r3 + ldrd_post r4, r5, r1, r3 + ldrd_post r6, r7, r2, r3 uxtb16 r8, r4 uxtb16 r4, r4, ror #8 uxtb16 r9, r6 @@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1 push {r4-r9, lr} mov r0, #0 mov lr, #0 - ldrd r4, r5, [r1], r3 + ldrd_post r4, r5, r1, r3 1: subs r12, r12, #2 ldr r7, [r2, #4] - ldr r6, [r2], r3 - ldrd r8, r9, [r1], r3 + ldr_post r6, r2, r3 + ldrd_post r8, r9, r1, r3 usada8 r0, r4, r6, r0 pld [r2, r3] usada8 lr, r5, r7, lr ldr r7, [r2, #4] - ldr r6, [r2], r3 + ldr_post r6, r2, r3 beq 2f - ldrd r4, r5, [r1], r3 + ldrd_post r4, r5, r1, r3 usada8 r0, r8, r6, r0 pld [r2, r3] usada8 lr, r9, r7, lr @@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1 ldr r7, [r0, #12] usada8 r2, r6, lr, r2 beq 2f - ldr r4, [r0, r1]! + ldr_pre r4, r0, r1 usada8 r3, r7, lr, r3 bgt 1b 2: diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 5b80e40d6f..5e3bf27f67 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1 2: vst1.32 {d2-d3}, [r3, :128]! vst1.32 {d0-d1}, [r12,:128]! + it lt bxlt lr 3: vld1.32 {d2-d3}, [r1,:128] @@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2 2: vst1.32 {q2},[r0,:128]! vst1.32 {q3},[r0,:128]! ands len, len, #15 + it eq bxeq lr 3: vld1.32 {q0},[r1,:128]! vmul.f32 q0, q0, q8 @@ -638,6 +640,7 @@ NOVFP ldr r3, [sp] 2: vst1.32 {q8},[r0,:128]! vst1.32 {q9},[r0,:128]! ands r3, r3, #7 + it eq popeq {pc} 3: vld1.32 {q0},[r1,:128]! ldr r12, [r2], #4 diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S index 16ea25afe6..cbc4bd6c70 100644 --- a/libavcodec/arm/dsputil_vfp.S +++ b/libavcodec/arm/dsputil_vfp.S @@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1 1: subs r3, r3, #16 vmul.f32 s12, s4, s12 + itttt ge vldmiage r1!, {s16-s19} vldmiage r2!, {s24-s27} vldmiage r1!, {s20-s23} vldmiage r2!, {s28-s31} + it ge vmulge.f32 s24, s16, s24 vstmia r0!, {s8-s11} vstmia r0!, {s12-s15} + it ge vmulge.f32 s28, s20, s28 + itttt gt vldmiagt r1!, {s0-s3} vldmiagt r2!, {s8-s11} vldmiagt r1!, {s4-s7} vldmiagt r2!, {s12-s15} + ittt ge vmulge.f32 s8, s0, s8 vstmiage r0!, {s24-s27} vstmiage r0!, {s28-s31} @@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1 vmul.f32 s11, s0, s11 1: subs r3, r3, #16 + it ge vldmdbge r2!, {s16-s19} vmul.f32 s12, s7, s12 + it ge vldmiage r1!, {s24-s27} vmul.f32 s13, s6, s13 + it ge vldmdbge r2!, {s20-s23} vmul.f32 s14, s5, s14 + it ge vldmiage r1!, {s28-s31} vmul.f32 s15, s4, s15 + it ge vmulge.f32 s24, s19, s24 + it gt vldmdbgt r2!, {s0-s3} + it ge vmulge.f32 s25, s18, s25 vstmia r0!, {s8-s13} + it ge vmulge.f32 s26, s17, s26 + it gt vldmiagt r1!, {s8-s11} + itt ge vmulge.f32 s27, s16, s27 vmulge.f32 s28, s23, s28 + it gt vldmdbgt r2!, {s4-s7} + it ge vmulge.f32 s29, s22, s29 vstmia r0!, {s14-s15} + ittt ge vmulge.f32 s30, s21, s30 vmulge.f32 s31, s20, s31 vmulge.f32 s8, s3, s8 + it gt vldmiagt r1!, {s12-s15} + itttt ge vmulge.f32 s9, s2, s9 vmulge.f32 s10, s1, s10 vstmiage r0!, {s24-s27} vmulge.f32 s11, s0, s11 + it ge vstmiage r0!, {s28-s31} bgt 1b diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S index 4b0e9a2d6a..45cc84b7ec 100644 --- a/libavcodec/arm/fmtconvert_neon.S +++ b/libavcodec/arm/fmtconvert_neon.S @@ -71,6 +71,7 @@ endfunc function ff_float_to_int16_interleave_neon, export=1 cmp r3, #2 + itt lt ldrlt r1, [r1] blt ff_float_to_int16_neon bne 4f @@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1 vst1.64 {d3}, [r8], ip vst1.64 {d7}, [r8], ip subs r3, r3, #4 + it eq popeq {r4-r8,pc} cmp r3, #4 add r0, r0, #8 @@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1 vst1.32 {d23[1]}, [r8], ip 8: subs r3, r3, #2 add r0, r0, #4 + it eq popeq {r4-r8,pc} @ 1 channel @@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1 vst1.16 {d2[3]}, [r5,:16], ip vst1.16 {d3[1]}, [r5,:16], ip vst1.16 {d3[3]}, [r5,:16], ip + it eq popeq {r4-r8,pc} vld1.64 {d0-d1}, [r4,:128]! vcvt.s32.f32 q0, q0, #16 diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S index 1bb7f49801..f7b0e3dcb5 100644 --- a/libavcodec/arm/fmtconvert_vfp.S +++ b/libavcodec/arm/fmtconvert_vfp.S @@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1 vmov r5, r6, s2, s3 vmov r7, r8, s4, s5 vmov ip, lr, s6, s7 + it gt vldmiagt r1!, {s16-s23} ssat r4, #16, r4 ssat r3, #16, r3 @@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1 ssat r5, #16, r5 pkhbt r3, r3, r4, lsl #16 pkhbt r4, r5, r6, lsl #16 + itttt gt vcvtgt.s32.f32 s0, s16 vcvtgt.s32.f32 s1, s17 vcvtgt.s32.f32 s2, s18 vcvtgt.s32.f32 s3, s19 + itttt gt vcvtgt.s32.f32 s4, s20 vcvtgt.s32.f32 s5, s21 vcvtgt.s32.f32 s6, s22 diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S index b76e4479b5..0fa4a6b0a5 100644 --- a/libavcodec/arm/h264dsp_neon.S +++ b/libavcodec/arm/h264dsp_neon.S @@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1 pld [r1] pld [r1, r2] - muls r7, r4, r5 +A muls r7, r4, r5 +T mul r7, r4, r5 +T cmp r7, #0 rsb r6, r7, r5, lsl #3 rsb ip, r7, r4, lsl #3 sub r4, r7, r4, lsl #3 @@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1 pld [r1] pld [r1, r2] - muls r7, r4, r5 +A muls r7, r4, r5 +T mul r7, r4, r5 +T cmp r7, #0 rsb r6, r7, r5, lsl #3 rsb ip, r7, r4, lsl #3 sub r4, r7, r4, lsl #3 @@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 pop {r4-r6, pc} 2: .ifc \type,put - ldrh r5, [r1], r2 - strh r5, [r0], r2 - ldrh r6, [r1], r2 - strh r6, [r0], r2 + ldrh_post r5, r1, r2 + strh_post r5, r0, r2 + ldrh_post r6, r1, r2 + strh_post r6, r0, r2 .else vld1.16 {d16[0]}, [r1], r2 vld1.16 {d16[1]}, [r1], r2 @@ -404,28 +408,17 @@ endfunc ldr ip, [sp] tst r2, r2 ldr ip, [ip] + it ne tstne r3, r3 vmov.32 d24[0], ip and ip, ip, ip, lsl #16 + it eq bxeq lr ands ip, ip, ip, lsl #8 + it lt bxlt lr .endm - .macro align_push_regs - and ip, sp, #15 - add ip, ip, #32 - sub sp, sp, ip - vst1.64 {d12-d15}, [sp,:128] - sub sp, sp, #32 - vst1.64 {d8-d11}, [sp,:128] - .endm - - .macro align_pop_regs - vld1.64 {d8-d11}, [sp,:128]! - vld1.64 {d12-d15}, [sp,:128], ip - .endm - .macro h264_loop_filter_luma vdup.8 q11, r2 @ alpha vmovl.u8 q12, d24 @@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 vld1.64 {d18,d19}, [r0,:128], r1 vld1.64 {d16,d17}, [r0,:128], r1 - align_push_regs + vpush {d8-d15} h264_loop_filter_luma @@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 vst1.64 {d0, d1}, [r0,:128], r1 vst1.64 {d10,d11}, [r0,:128] - align_pop_regs + vpop {d8-d15} bx lr endfunc @@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 - align_push_regs + vpush {d8-d15} h264_loop_filter_luma @@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1 vst1.32 {d1[1]}, [r0], r1 vst1.32 {d11[1]}, [r0], r1 - align_pop_regs + vpop {d8-d15} bx lr endfunc @@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon vrhadd.u8 d11, d11, d7 sub r0, r0, r2, lsl #3 .endif + vst1.64 {d12}, [r0,:64], r2 vst1.64 {d13}, [r0,:64], r2 vst1.64 {d14}, [r0,:64], r2 @@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 \type\()_h264_qpel8_mc11: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #64 mov r0, sp sub r1, r1, #2 @@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1 mov ip, #8 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon - ldrd r0, [r11] + ldrd r0, [r11], #8 mov r3, r2 add ip, sp, #64 sub r1, r1, r2, lsl #1 mov r2, #8 bl \type\()_h264_qpel8_v_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r11, pc} endfunc @@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 \type\()_h264_qpel8_mc21: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #(8*8+16*12) sub r1, r1, #2 mov r3, #8 @@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1 vpush {d8-d15} bl put_h264_qpel8_h_lowpass_neon mov r4, r0 - ldrd r0, [r11] + ldrd r0, [r11], #8 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 sub r2, r4, #64 bl \type\()_h264_qpel8_hv_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r4, r10, r11, pc} endfunc @@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 \type\()_h264_qpel8_mc12: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #(8*8+16*12) sub r1, r1, r2, lsl #1 mov r3, r2 @@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1 vpush {d8-d15} bl put_h264_qpel8_v_lowpass_neon mov r4, r0 - ldrd r0, [r11] + ldrd r0, [r11], #8 sub r1, r1, r3, lsl #1 sub r1, r1, #2 sub r2, r4, #64 bl \type\()_h264_qpel8_hv_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r4, r10, r11, pc} endfunc function ff_\type\()_h264_qpel8_mc22_neon, export=1 push {r4, r10, r11, lr} mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r4, r11, #15 +T mov sp, r4 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 @@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1 \type\()_h264_qpel16_mc11: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #256 mov r0, sp sub r1, r1, #2 mov r3, #16 vpush {d8-d15} bl put_h264_qpel16_h_lowpass_neon - ldrd r0, [r11] + ldrd r0, [r11], #8 mov r3, r2 add ip, sp, #64 sub r1, r1, r2, lsl #1 mov r2, #16 bl \type\()_h264_qpel16_v_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r4, r11, pc} endfunc @@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1 \type\()_h264_qpel16_mc21: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #(16*16+16*12) sub r1, r1, #2 mov r0, sp vpush {d8-d15} bl put_h264_qpel16_h_lowpass_neon_packed mov r4, r0 - ldrd r0, [r11] + ldrd r0, [r11], #8 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 bl \type\()_h264_qpel16_hv_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r4-r5, r9-r11, pc} endfunc @@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 \type\()_h264_qpel16_mc12: lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r0, r11, #15 +T mov sp, r0 sub sp, sp, #(16*16+16*12) sub r1, r1, r2, lsl #1 mov r0, sp @@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1 vpush {d8-d15} bl put_h264_qpel16_v_lowpass_neon_packed mov r4, r0 - ldrd r0, [r11] + ldrd r0, [r11], #8 sub r1, r1, r3, lsl #1 sub r1, r1, #2 mov r2, r3 bl \type\()_h264_qpel16_hv_lowpass_l2_neon vpop {d8-d15} - add sp, r11, #8 + mov sp, r11 pop {r4-r5, r9-r11, pc} endfunc @@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1 push {r4, r9-r11, lr} lowpass_const r3 mov r11, sp - bic sp, sp, #15 +A bic sp, sp, #15 +T bic r4, r11, #15 +T mov sp, r4 sub r1, r1, r2, lsl #1 sub r1, r1, #2 mov r3, r2 diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S index 3c743e1607..eadf2e711d 100644 --- a/libavcodec/arm/h264idct_neon.S +++ b/libavcodec/arm/h264idct_neon.S @@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1 blt 2f ldrsh lr, [r1] add r0, r0, r4 + it ne movne lr, #0 cmp lr, #0 - adrne lr, ff_h264_idct_dc_add_neon - adreq lr, ff_h264_idct_add_neon + ite ne + adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB blx lr 2: subs ip, ip, #1 add r1, r1, #32 @@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1 add r0, r0, r4 cmp r8, #0 ldrsh r8, [r1] - adrne lr, ff_h264_idct_add_neon - adreq lr, ff_h264_idct_dc_add_neon + iteet ne + adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB cmpeq r8, #0 blxne lr subs ip, ip, #1 @@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1 add r1, r3, r12, lsl #5 cmp r8, #0 ldrsh r8, [r1] - adrne lr, ff_h264_idct_add_neon - adreq lr, ff_h264_idct_dc_add_neon + iteet ne + adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB cmpeq r8, #0 blxne lr add r12, r12, #1 cmp r12, #4 + itt eq moveq r12, #16 moveq r4, r9 cmp r12, #20 @@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1 blt 2f ldrsh lr, [r1] add r0, r0, r4 + it ne movne lr, #0 cmp lr, #0 - adrne lr, ff_h264_idct8_dc_add_neon - adreq lr, ff_h264_idct8_add_neon + ite ne + adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB + adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB blx lr 2: subs r12, r12, #4 add r1, r1, #128 diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h index b27b18f871..3803fcde8c 100644 --- a/libavcodec/arm/mathops.h +++ b/libavcodec/arm/mathops.h @@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c) __asm__ ( "mov %0, %2 \n\t" "cmp %1, %2 \n\t" + "itt gt \n\t" "movgt %0, %1 \n\t" "movgt %1, %2 \n\t" "cmp %1, %3 \n\t" + "it le \n\t" "movle %1, %3 \n\t" "cmp %0, %1 \n\t" + "it gt \n\t" "movgt %0, %1 \n\t" : "=&r"(m), "+r"(a) : "r"(b), "r"(c) diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S index c375f4c121..1ba3067c4e 100644 --- a/libavcodec/arm/mdct_neon.S +++ b/libavcodec/arm/mdct_neon.S @@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1 vadd.f32 d17, d17, d3 @ in2u+in1d -I 1: vmul.f32 d7, d0, d21 @ I*s - ldr r10, [r3, lr, lsr #1] +A ldr r10, [r3, lr, lsr #1] +T lsr r10, lr, #1 +T ldr r10, [r3, r10] vmul.f32 d6, d1, d20 @ -R*c ldr r6, [r3, #4]! vmul.f32 d4, d1, d21 @ -R*s diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S index 9ec731480b..b517b973e7 100644 --- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S +++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S @@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 sum8 r8, r9, r1, r0, r10, r11, r12, lr sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 round r10, r8, r9 - strh r10, [r3], r4 + strh_post r10, r3, r4 mov lr, #15 1: @@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 round r10, r8, r9 adds r8, r8, r4 adc r9, r9, r7 - strh r10, [r3], r12 + strh_post r10, r3, r12 round r11, r8, r9 subs lr, lr, #1 - strh r11, [r5], -r12 + strh_dpost r11, r5, r12 bgt 1b sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S index e3461601d5..952c8d74cb 100644 --- a/libavcodec/arm/mpegvideo_armv5te_s.S +++ b/libavcodec/arm/mpegvideo_armv5te_s.S @@ -38,15 +38,21 @@ .macro dequant_t dst, src, mul, add, tmp rsbs \tmp, ip, \src, asr #16 + it gt addgt \tmp, \add, #0 + it lt rsblt \tmp, \add, #0 + it ne smlatbne \dst, \src, \mul, \tmp .endm .macro dequant_b dst, src, mul, add, tmp rsbs \tmp, ip, \src, lsl #16 + it gt addgt \tmp, \add, #0 + it lt rsblt \tmp, \add, #0 + it ne smlabbne \dst, \src, \mul, \tmp .endm @@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1 strh lr, [r0], #2 subs r3, r3, #8 + it gt ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ bgt 1b adds r3, r3, #2 + it le pople {r4-r9,pc} 2: ldrsh r9, [r0, #0] ldrsh lr, [r0, #2] mov r8, r2 cmp r9, #0 + it lt rsblt r8, r2, #0 + it ne smlabbne r9, r9, r1, r8 mov r8, r2 cmp lr, #0 + it lt rsblt r8, r2, #0 + it ne smlabbne lr, lr, r1, r8 strh r9, [r0], #2 strh lr, [r0], #2 diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S index 365dcf65cf..206a71a14d 100644 --- a/libavcodec/arm/mpegvideo_neon.S +++ b/libavcodec/arm/mpegvideo_neon.S @@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1 subs r3, r3, #16 vst1.16 {q0}, [r1,:128]! vst1.16 {q8}, [r1,:128]! + it le bxle lr cmp r3, #8 bgt 1b @@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1 ldr r6, [r0, #AC_PRED] add lr, r0, #INTER_SCANTAB_RASTER_END cmp r6, #0 + it ne movne r12, #63 bne 1f ldr r12, [r12, r2, lsl #2] @@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1 ldrsh r4, [r1] cmp r5, #0 mov r5, r1 + it ne movne r2, #0 bne 2f cmp r2, #4 + it ge addge r0, r0, #4 sub r2, r3, #1 ldr r6, [r0, #Y_DC_SCALE] diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S index 8aafdc9cf4..fba275eb8c 100644 --- a/libavcodec/arm/rdft_neon.S +++ b/libavcodec/arm/rdft_neon.S @@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1 vst1.32 {d22}, [r5,:64] cmp r6, #0 + it eq popeq {r4-r8,pc} vmul.f32 d22, d22, d18 diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S index 4e6dfa4391..717b12c64b 100644 --- a/libavcodec/arm/simple_idct_arm.S +++ b/libavcodec/arm/simple_idct_arm.S @@ -121,11 +121,13 @@ __b_evaluation: ldr r11, [r12, #offW7] @ R11=W7 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if null avoid muls - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if null avoid muls + itttt ne + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) rsbne r2, r2, #0 @ R2=-ROWr16[3] mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], @@ -148,19 +150,23 @@ __b_evaluation: @@ MAC16(b3, -W1, row[7]); @@ MAC16(b1, -W5, row[7]); mov r3, r3, asr #16 @ R3=ROWr16[5] - teq r3, #0 @ if null avoid muls + teq r3, #0 @ if null avoid muls + it ne mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 mov r4, r4, asr #16 @ R4=ROWr16[7] + itttt ne mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 rsbne r3, r3, #0 @ R3=-ROWr16[5] mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 @@ R3 is free now - teq r4, #0 @ if null avoid muls + teq r4, #0 @ if null avoid muls + itttt ne mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 rsbne r4, r4, #0 @ R4=-ROWr16[7] mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 + it ne mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 @@ R4 is free now __end_b_evaluation: @@ -204,16 +210,19 @@ __a_evaluation: @@ a2 -= W4*row[4] @@ a3 += W4*row[4] ldrsh r11, [r14, #8] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls + teq r11, #0 @ if null avoid muls + it ne mulne r11, r9, r11 @ R11=W4*ROWr16[4] @@ R9 is free now ldrsh r9, [r14, #12] @ R9=ROWr16[6] + itttt ne addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls + teq r9, #0 @ if null avoid muls + itttt ne mulne r11, r10, r9 @ R11=W6*ROWr16[6] addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) mulne r10, r8, r9 @ R10=W2*ROWr16[6] @@ -222,6 +231,7 @@ __a_evaluation: @@ a1 -= W2*row[6]; @@ a2 += W2*row[6]; subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) @@ -323,10 +333,12 @@ __b_evaluation2: ldrsh r2, [r14, #48] mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) teq r2, #0 @ if 0, then avoid muls + itttt ne mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) rsbne r2, r2, #0 @ R2=-ROWr16[3] mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), @@ -342,18 +354,22 @@ __b_evaluation2: @@ MAC16(b1, -W5, col[7x8]); ldrsh r3, [r14, #80] @ R3=COLr16[5x8] teq r3, #0 @ if 0 then avoid muls + itttt ne mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 rsbne r3, r3, #0 @ R3=-ROWr16[5x8] ldrsh r4, [r14, #112] @ R4=COLr16[7x8] + it ne mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 @@ R3 is free now teq r4, #0 @ if 0 then avoid muls + itttt ne mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 rsbne r4, r4, #0 @ R4=-ROWr16[7x8] mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 + it ne mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 @@ R4 is free now __end_b_evaluation2: @@ -390,15 +406,18 @@ __a_evaluation2: @@ a3 += W4*row[4] ldrsh r11, [r14, #64] @ R11=ROWr16[4] teq r11, #0 @ if null avoid muls + itttt ne mulne r11, r9, r11 @ R11=W4*ROWr16[4] @@ R9 is free now addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) ldrsh r9, [r14, #96] @ R9=ROWr16[6] + it ne addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead teq r9, #0 @ if null avoid muls + itttt ne mulne r11, r10, r9 @ R11=W6*ROWr16[6] addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) mulne r10, r8, r9 @ R10=W2*ROWr16[6] @@ -407,6 +426,7 @@ __a_evaluation2: @@ a1 -= W2*row[6]; @@ a2 += W2*row[6]; subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) __end_a_evaluation2: diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S index 29ebf5c03c..24641e47b6 100644 --- a/libavcodec/arm/simple_idct_armv5te.S +++ b/libavcodec/arm/simple_idct_armv5te.S @@ -49,6 +49,7 @@ function idct_row_armv5te ldrd v1, [a1, #8] ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ orrs v1, v1, v2 + itt eq cmpeq v1, a4 cmpeq v1, a3, lsr #16 beq row_dc_only @@ -269,6 +270,7 @@ function idct_col_armv5te ldmfd sp!, {a3, a4} adds a2, a3, v1 mov a2, a2, lsr #20 + it mi orrmi a2, a2, #0xf000 add ip, a4, v2 mov ip, ip, asr #20 @@ -276,6 +278,7 @@ function idct_col_armv5te str a2, [a1] subs a3, a3, v1 mov a2, a3, lsr #20 + it mi orrmi a2, a2, #0xf000 sub a4, a4, v2 mov a4, a4, asr #20 @@ -285,6 +288,7 @@ function idct_col_armv5te subs a2, a3, v3 mov a2, a2, lsr #20 + it mi orrmi a2, a2, #0xf000 sub ip, a4, v4 mov ip, ip, asr #20 @@ -292,6 +296,7 @@ function idct_col_armv5te str a2, [a1, #(16*1)] adds a3, a3, v3 mov a2, a3, lsr #20 + it mi orrmi a2, a2, #0xf000 add a4, a4, v4 mov a4, a4, asr #20 @@ -301,6 +306,7 @@ function idct_col_armv5te adds a2, a3, v5 mov a2, a2, lsr #20 + it mi orrmi a2, a2, #0xf000 add ip, a4, v6 mov ip, ip, asr #20 @@ -308,6 +314,7 @@ function idct_col_armv5te str a2, [a1, #(16*2)] subs a3, a3, v5 mov a2, a3, lsr #20 + it mi orrmi a2, a2, #0xf000 sub a4, a4, v6 mov a4, a4, asr #20 @@ -317,6 +324,7 @@ function idct_col_armv5te adds a2, a3, v7 mov a2, a2, lsr #20 + it mi orrmi a2, a2, #0xf000 add ip, a4, fp mov ip, ip, asr #20 @@ -324,6 +332,7 @@ function idct_col_armv5te str a2, [a1, #(16*3)] subs a3, a3, v7 mov a2, a3, lsr #20 + it mi orrmi a2, a2, #0xf000 sub a4, a4, fp mov a4, a4, asr #20 @@ -335,15 +344,19 @@ endfunc .macro clip dst, src:vararg movs \dst, \src + it mi movmi \dst, #0 cmp \dst, #255 + it gt movgt \dst, #255 .endm .macro aclip dst, src:vararg adds \dst, \src + it mi movmi \dst, #0 cmp \dst, #255 + it gt movgt \dst, #255 .endm @@ -370,35 +383,35 @@ function idct_col_put_armv5te orr a2, a3, a4, lsl #8 rsb v2, lr, lr, lsl #3 ldmfd sp!, {a3, a4} - strh a2, [v2, v1]! + strh_pre a2, v2, v1 sub a2, a3, v3 clip a2, a2, asr #20 sub ip, a4, v4 clip ip, ip, asr #20 orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! + strh_pre a2, v1, lr add a3, a3, v3 clip a2, a3, asr #20 add a4, a4, v4 clip a4, a4, asr #20 orr a2, a2, a4, lsl #8 ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! + strh_dpre a2, v2, lr add a2, a3, v5 clip a2, a2, asr #20 add ip, a4, v6 clip ip, ip, asr #20 orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! + strh_pre a2, v1, lr sub a3, a3, v5 clip a2, a3, asr #20 sub a4, a4, v6 clip a4, a4, asr #20 orr a2, a2, a4, lsl #8 ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! + strh_dpre a2, v2, lr add a2, a3, v7 clip a2, a2, asr #20 @@ -411,7 +424,7 @@ function idct_col_put_armv5te sub a4, a4, fp clip a4, a4, asr #20 orr a2, a2, a4, lsl #8 - strh a2, [v2, -lr] + strh_dpre a2, v2, lr ldr pc, [sp], #4 endfunc @@ -436,7 +449,7 @@ function idct_col_add_armv5te ldr v1, [sp, #32] sub a4, a4, v2 rsb v2, v1, v1, lsl #3 - ldrh ip, [v2, lr]! + ldrh_pre ip, v2, lr strh a2, [lr] and a2, ip, #255 aclip a3, a2, a3, asr #20 @@ -448,7 +461,7 @@ function idct_col_add_armv5te strh a2, [v2] ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! + ldrh_pre ip, lr, v1 sub a2, a3, v3 add a3, a3, v3 and v3, ip, #255 @@ -458,7 +471,7 @@ function idct_col_add_armv5te aclip v3, v3, ip, lsr #8 orr a2, a2, v3, lsl #8 add a4, a4, v4 - ldrh ip, [v2, -v1]! + ldrh_dpre ip, v2, v1 strh a2, [lr] and a2, ip, #255 aclip a3, a2, a3, asr #20 @@ -468,7 +481,7 @@ function idct_col_add_armv5te strh a2, [v2] ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! + ldrh_pre ip, lr, v1 add a2, a3, v5 sub a3, a3, v5 and v3, ip, #255 @@ -478,7 +491,7 @@ function idct_col_add_armv5te aclip v3, v3, ip, lsr #8 orr a2, a2, v3, lsl #8 sub a4, a4, v6 - ldrh ip, [v2, -v1]! + ldrh_dpre ip, v2, v1 strh a2, [lr] and a2, ip, #255 aclip a3, a2, a3, asr #20 @@ -488,7 +501,7 @@ function idct_col_add_armv5te strh a2, [v2] ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! + ldrh_pre ip, lr, v1 add a2, a3, v7 sub a3, a3, v7 and v3, ip, #255 @@ -498,7 +511,7 @@ function idct_col_add_armv5te aclip v3, v3, ip, lsr #8 orr a2, a2, v3, lsl #8 sub a4, a4, fp - ldrh ip, [v2, -v1]! + ldrh_dpre ip, v2, v1 strh a2, [lr] and a2, ip, #255 aclip a3, a2, a3, asr #20 diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S index b2eb525de5..284eb1f941 100644 --- a/libavcodec/arm/simple_idct_armv6.S +++ b/libavcodec/arm/simple_idct_armv6.S @@ -200,6 +200,7 @@ function idct_row_armv6 ldr r3, [r0, #8] /* r3 = row[3,1] */ ldr r2, [r0] /* r2 = row[2,0] */ orrs lr, lr, ip + itt eq cmpeq lr, r3 cmpeq lr, r2, lsr #16 beq 1f @@ -282,14 +283,14 @@ function idct_col_put_armv6 pop {r1, r2} idct_finish_shift_sat COL_SHIFT - strb r4, [r1], r2 - strb r5, [r1], r2 - strb r6, [r1], r2 - strb r7, [r1], r2 - strb r11,[r1], r2 - strb r10,[r1], r2 - strb r9, [r1], r2 - strb r8, [r1], r2 + strb_post r4, r1, r2 + strb_post r5, r1, r2 + strb_post r6, r1, r2 + strb_post r7, r1, r2 + strb_post r11,r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 sub r1, r1, r2, lsl #3 @@ -318,16 +319,16 @@ function idct_col_add_armv6 add ip, r3, ip, asr #COL_SHIFT usat ip, #8, ip add r4, r7, r4, asr #COL_SHIFT - strb ip, [r1], r2 + strb_post ip, r1, r2 ldrb ip, [r1, r2] usat r4, #8, r4 ldrb r11,[r1, r2, lsl #2] add r5, ip, r5, asr #COL_SHIFT usat r5, #8, r5 - strb r4, [r1], r2 + strb_post r4, r1, r2 ldrb r3, [r1, r2] ldrb ip, [r1, r2, lsl #2] - strb r5, [r1], r2 + strb_post r5, r1, r2 ldrb r7, [r1, r2] ldrb r4, [r1, r2, lsl #2] add r6, r3, r6, asr #COL_SHIFT @@ -340,11 +341,11 @@ function idct_col_add_armv6 usat r8, #8, r8 add lr, r4, lr, asr #COL_SHIFT usat lr, #8, lr - strb r6, [r1], r2 - strb r10,[r1], r2 - strb r9, [r1], r2 - strb r8, [r1], r2 - strb lr, [r1], r2 + strb_post r6, r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + strb_post lr, r1, r2 sub r1, r1, r2, lsl #3 diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S index 0628b9676a..cbed9eefe4 100644 --- a/libavcodec/arm/simple_idct_neon.S +++ b/libavcodec/arm/simple_idct_neon.S @@ -71,7 +71,7 @@ function idct_row4_pld_neon add r3, r0, r1, lsl #2 pld [r0, r1] pld [r0, r1, lsl #1] - pld [r3, -r1] +A pld [r3, -r1] pld [r3] pld [r3, r1] add r3, r3, r1, lsl #1 @@ -164,6 +164,7 @@ function idct_col4_neon orrs r4, r4, r5 idct_col4_top + it eq addeq r2, r2, #16 beq 1f @@ -176,6 +177,7 @@ function idct_col4_neon 1: orrs r6, r6, r7 ldrd r4, [r2, #16] + it eq addeq r2, r2, #16 beq 2f @@ -187,6 +189,7 @@ function idct_col4_neon 2: orrs r4, r4, r5 ldrd r4, [r2, #16] + it eq addeq r2, r2, #16 beq 3f @@ -199,6 +202,7 @@ function idct_col4_neon vadd.i32 q13, q13, q8 3: orrs r4, r4, r5 + it eq addeq r2, r2, #16 beq 4f diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S index d4f67b785c..1d6e5b2b86 100644 --- a/libavcodec/arm/synth_filter_neon.S +++ b/libavcodec/arm/synth_filter_neon.S @@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale vst1.32 {q9}, [r2,:128] subs r1, r1, #1 + it eq popeq {r4-r11,pc} cmp r4, #0 + itt eq subeq r8, r8, #512*4 subeq r9, r9, #512*4 sub r5, r5, #512*4 diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h index 9ce3fd0d91..ef30ffe897 100644 --- a/libavcodec/arm/vp56_arith.h +++ b/libavcodec/arm/vp56_arith.h @@ -21,6 +21,14 @@ #ifndef AVCODEC_ARM_VP56_ARITH_H #define AVCODEC_ARM_VP56_ARITH_H +#if CONFIG_THUMB +# define A(x) +# define T(x) x +#else +# define A(x) x +# define T(x) +#endif + #if HAVE_ARMV6 && HAVE_INLINE_ASM #define vp56_rac_get_prob vp56_rac_get_prob_armv6 @@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) unsigned bit; __asm__ ("adds %3, %3, %0 \n" + "itt cs \n" "cmpcs %7, %4 \n" - "ldrcsh %2, [%4], #2 \n" + A("ldrcsh %2, [%4], #2 \n") + T("ldrhcs %2, [%4], #2 \n") "rsb %0, %6, #256 \n" "smlabb %0, %5, %6, %0 \n" + T("itttt cs \n") "rev16cs %2, %2 \n" - "orrcs %1, %1, %2, lsl %3 \n" + T("lslcs %2, %2, %3 \n") + T("orrcs %1, %1, %2 \n") + A("orrcs %1, %1, %2, lsl %3 \n") "subcs %3, %3, #16 \n" "lsr %0, %0, #8 \n" "cmp %1, %0, lsl #16 \n" + "ittte ge \n" "subge %1, %1, %0, lsl #16 \n" "subge %0, %5, %0 \n" "movge %2, #1 \n" @@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) unsigned tmp; __asm__ ("adds %3, %3, %0 \n" + "itt cs \n" "cmpcs %7, %4 \n" - "ldrcsh %2, [%4], #2 \n" + A("ldrcsh %2, [%4], #2 \n") + T("ldrhcs %2, [%4], #2 \n") "rsb %0, %6, #256 \n" "smlabb %0, %5, %6, %0 \n" + T("itttt cs \n") "rev16cs %2, %2 \n" - "orrcs %1, %1, %2, lsl %3 \n" + T("lslcs %2, %2, %3 \n") + T("orrcs %1, %1, %2 \n") + A("orrcs %1, %1, %2, lsl %3 \n") "subcs %3, %3, #16 \n" "lsr %0, %0, #8 \n" "lsl %2, %0, #16 \n" diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S index 602c8a58be..1d89c68909 100644 --- a/libavcodec/arm/vp8_armv6.S +++ b/libavcodec/arm/vp8_armv6.S @@ -25,13 +25,18 @@ lsl \cw, \cw, \t0 lsl \t0, \h, \t0 rsb \h, \pr, #256 + it cs ldrhcs \t1, [\buf], #2 smlabb \h, \t0, \pr, \h +T itttt cs rev16cs \t1, \t1 - orrcs \cw, \cw, \t1, lsl \bs +A orrcs \cw, \cw, \t1, lsl \bs +T lslcs \t1, \t1, \bs +T orrcs \cw, \cw, \t1 subcs \bs, \bs, #16 lsr \h, \h, #8 cmp \cw, \h, lsl #16 + itt ge subge \cw, \cw, \h, lsl #16 subge \h, \t0, \h .endm @@ -40,14 +45,20 @@ adds \bs, \bs, \t0 lsl \cw, \cw, \t0 lsl \t0, \h, \t0 + it cs ldrhcs \t1, [\buf], #2 mov \h, #128 + it cs rev16cs \t1, \t1 add \h, \h, \t0, lsl #7 - orrcs \cw, \cw, \t1, lsl \bs +A orrcs \cw, \cw, \t1, lsl \bs +T ittt cs +T lslcs \t1, \t1, \bs +T orrcs \cw, \cw, \t1 subcs \bs, \bs, #16 lsr \h, \h, #8 cmp \cw, \h, lsl #16 + itt ge subge \cw, \cw, \h, lsl #16 subge \h, \t0, \h .endm @@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1 cmp r3, #0 ldr r11, [r5] ldm r0, {r5-r7} @ high, bits, buf + it ne pkhtbne r11, r11, r11, asr #16 ldr r8, [r0, #16] @ code_word 0: @@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1 adds r6, r6, r9 add r4, r4, #11 lsl r8, r8, r9 + it cs ldrhcs r10, [r7], #2 lsl r9, r5, r9 mov r5, #128 + it cs rev16cs r10, r10 add r5, r5, r9, lsl #7 - orrcs r8, r8, r10, lsl r6 +T ittt cs +T lslcs r10, r10, r6 +T orrcs r8, r8, r10 +A orrcs r8, r8, r10, lsl r6 subcs r6, r6, #16 lsr r5, r5, #8 cmp r8, r5, lsl #16 movrel r10, zigzag_scan-1 + itt ge subge r8, r8, r5, lsl #16 subge r5, r9, r5 ldrb r10, [r10, r3] + it ge rsbge r12, r12, #0 cmp r3, #16 strh r12, [r1, r10] @@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1 ldr r0, [sp] ldr r9, [r0, #12] cmp r7, r9 + it hi movhi r7, r9 stm r0, {r5-r7} @ high, bits, buf str r8, [r0, #16] @ code_word @@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1 mov r12, #2 ldrb r0, [r4, #4] rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r12, #1 ldrb r9, [lr, r5] blt 4f ldrb r0, [r4, #5] rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r12, #1 ldrb r9, [lr, r5] b 4f @@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1 mov r12, #5 mov r0, #159 rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r12, r12, #1 ldrb r9, [lr, r5] b 4f @@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1 mov r12, #7 mov r0, #165 rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r12, r12, #2 ldrb r9, [lr, r5] mov r0, #145 rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r12, r12, #1 ldrb r9, [lr, r5] b 4f 3: ldrb r0, [r4, #8] rac_get_prob r5, r6, r7, r8, r0, r9, r10 + it ge addge r4, r4, #1 ldrb r9, [lr, r5] + ite ge movge r12, #2 movlt r12, #0 ldrb r0, [r4, #9] rac_get_prob r5, r6, r7, r8, r0, r9, r10 mov r9, #8 + it ge addge r12, r12, #1 movrel r4, X(ff_vp8_dct_cat_prob) lsl r9, r9, r12 @@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1 lsl r1, r1, #1 rac_get_prob r5, r6, r7, r8, r0, r9, r10 ldrb r0, [r4], #1 + it ge addge r1, r1, #1 cmp r0, #0 bne 1b @@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1 add r4, r2, r4 add r4, r4, #22 rac_get_128 r5, r6, r7, r8, r9, r10 + it ge rsbge r12, r12, #0 smulbb r12, r12, r11 movrel r9, zigzag_scan-1 diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S index 92b1ef4eb7..1b9f24eef2 100644 --- a/libavcodec/arm/vp8dsp_neon.S +++ b/libavcodec/arm/vp8dsp_neon.S @@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1 push {r4-r6,lr} 1: subs r12, r12, #4 - ldr r4, [r2], r3 - ldr r5, [r2], r3 - ldr r6, [r2], r3 - ldr lr, [r2], r3 - str r4, [r0], r1 - str r5, [r0], r1 - str r6, [r0], r1 - str lr, [r0], r1 + ldr_post r4, r2, r3 + ldr_post r5, r2, r3 + ldr_post r6, r2, r3 + ldr_post lr, r2, r3 + str_post r4, r0, r1 + str_post r5, r0, r1 + str_post r6, r0, r1 + str_post lr, r0, r1 bgt 1b pop {r4-r6,pc} endfunc -- cgit v1.2.3