56 files changed, 570 insertions, 300 deletions
diff --git a/libavcodec/acelp_pitch_delay.h b/libavcodec/acelp_pitch_delay.h
index ce06bc2539..72977f1f49 100644
--- a/libavcodec/acelp_pitch_delay.h
+++ b/libavcodec/acelp_pitch_delay.h
@@ -30,11 +30,11 @@
 #define PITCH_DELAY_MAX             143
 
 /**
- * \brief Decode pitch delay of the first subframe encoded by 8 bits with 1/3
+ * @brief Decode pitch delay of the first subframe encoded by 8 bits with 1/3
  *        resolution.
- * \param ac_index adaptive codebook index (8 bits)
+ * @param ac_index adaptive codebook index (8 bits)
  *
- * \return pitch delay in 1/3 units
+ * @return pitch delay in 1/3 units
  *
  * Pitch delay is coded:
  *    with 1/3 resolution, 19  < pitch_delay <  85
@@ -43,18 +43,18 @@
 int ff_acelp_decode_8bit_to_1st_delay3(int ac_index);
 
 /**
- * \brief Decode pitch delay of the second subframe encoded by 5 or 6 bits
+ * @brief Decode pitch delay of the second subframe encoded by 5 or 6 bits
  *        with 1/3 precision.
- * \param ac_index adaptive codebook index (5 or 6 bits)
- * \param pitch_delay_min lower bound (integer) of pitch delay interval
+ * @param ac_index adaptive codebook index (5 or 6 bits)
+ * @param pitch_delay_min lower bound (integer) of pitch delay interval
  *                      for second subframe
  *
- * \return pitch delay in 1/3 units
+ * @return pitch delay in 1/3 units
  *
  * Pitch delay is coded:
  *    with 1/3 resolution, -6 < pitch_delay - int(prev_pitch_delay) < 5
  *
- * \remark The routine is used in G.729 @@8k, AMR @@10.2k, AMR @@7.95k,
+ * @remark The routine is used in G.729 @@8k, AMR @@10.2k, AMR @@7.95k,
  *         AMR @@7.4k for the second subframe.
  */
 int ff_acelp_decode_5_6_bit_to_2nd_delay3(
@@ -62,19 +62,19 @@ int ff_acelp_decode_5_6_bit_to_2nd_delay3(
         int pitch_delay_min);
 
 /**
- * \brief Decode pitch delay with 1/3 precision.
- * \param ac_index adaptive codebook index (4 bits)
- * \param pitch_delay_min lower bound (integer) of pitch delay interval for
+ * @brief Decode pitch delay with 1/3 precision.
+ * @param ac_index adaptive codebook index (4 bits)
+ * @param pitch_delay_min lower bound (integer) of pitch delay interval for
  *                      second subframe
  *
- * \return pitch delay in 1/3 units
+ * @return pitch delay in 1/3 units
  *
  * Pitch delay is coded:
  *    integers only,          -6  < pitch_delay - int(prev_pitch_delay) <= -2
  *    with 1/3 resolution,    -2  < pitch_delay - int(prev_pitch_delay) <  1
  *    integers only,           1 <= pitch_delay - int(prev_pitch_delay) <  5
  *
- * \remark The routine is used in G.729 @@6.4k, AMR @@6.7k, AMR @@5.9k,
+ * @remark The routine is used in G.729 @@6.4k, AMR @@6.7k, AMR @@5.9k,
  *         AMR @@5.15k, AMR @@4.75k for the second subframe.
  */
 int ff_acelp_decode_4bit_to_2nd_delay3(
@@ -82,44 +82,44 @@ int ff_acelp_decode_4bit_to_2nd_delay3(
         int pitch_delay_min);
 
 /**
- * \brief Decode pitch delay of the first subframe encoded by 9 bits
+ * @brief Decode pitch delay of the first subframe encoded by 9 bits
  *        with 1/6 precision.
- * \param ac_index adaptive codebook index (9 bits)
+ * @param ac_index adaptive codebook index (9 bits)
  *
- * \return pitch delay in 1/6 units
+ * @return pitch delay in 1/6 units
  *
  * Pitch delay is coded:
  *    with 1/6 resolution,  17  < pitch_delay <  95
  *    integers only,        95 <= pitch_delay <= 143
  *
- * \remark The routine is used in AMR @@12.2k for the first and third subframes.
+ * @remark The routine is used in AMR @@12.2k for the first and third subframes.
  */
 int ff_acelp_decode_9bit_to_1st_delay6(int ac_index);
 
 /**
- * \brief Decode pitch delay of the second subframe encoded by 6 bits
+ * @brief Decode pitch delay of the second subframe encoded by 6 bits
  *        with 1/6 precision.
- * \param ac_index adaptive codebook index (6 bits)
- * \param pitch_delay_min lower bound (integer) of pitch delay interval for
+ * @param ac_index adaptive codebook index (6 bits)
+ * @param pitch_delay_min lower bound (integer) of pitch delay interval for
  *                      second subframe
  *
- * \return pitch delay in 1/6 units
+ * @return pitch delay in 1/6 units
  *
  * Pitch delay is coded:
  *    with 1/6 resolution, -6 < pitch_delay - int(prev_pitch_delay) < 5
  *
- * \remark The routine is used in AMR @@12.2k for the second and fourth subframes.
+ * @remark The routine is used in AMR @@12.2k for the second and fourth subframes.
  */
 int ff_acelp_decode_6bit_to_2nd_delay6(
         int ac_index,
         int pitch_delay_min);
 
 /**
- * \brief Update past quantized energies
- * \param[in,out]  quant_energy  past quantized energies (5.10)
- * \param gain_corr_factor gain correction factor
- * \param log2_ma_pred_order log2() of MA prediction order
- * \param erasure frame erasure flag
+ * @brief Update past quantized energies
+ * @param[in,out]  quant_energy  past quantized energies (5.10)
+ * @param gain_corr_factor gain correction factor
+ * @param log2_ma_pred_order log2() of MA prediction order
+ * @param erasure frame erasure flag
  *
  * If frame erasure flag is not equal to zero, memory is updated with
  * averaged energy, attenuated by 4dB:
@@ -128,7 +128,7 @@ int ff_acelp_decode_6bit_to_2nd_delay6(
  * In normal mode memory is updated with
  *     Er - Ep = 20 * log10(gain_corr_factor)
  *
- * \remark The routine is used in G.729 and AMR (all modes).
+ * @remark The routine is used in G.729 and AMR (all modes).
  */
 void ff_acelp_update_past_gain(
         int16_t* quant_energy,
@@ -137,16 +137,16 @@ void ff_acelp_update_past_gain(
         int erasure);
 
 /**
- * \brief Decode the adaptive codebook gain and add
+ * @brief Decode the adaptive codebook gain and add
  *        correction (4.1.5 and 3.9.1 of G.729).
- * \param dsp initialized dsputil context
- * \param gain_corr_factor gain correction factor (2.13)
- * \param fc_v fixed-codebook vector (2.13)
- * \param mr_energy mean innovation energy and fixed-point correction (7.13)
- * \param[in,out]  quant_energy  past quantized energies (5.10)
- * \param subframe_size length of subframe
+ * @param dsp initialized dsputil context
+ * @param gain_corr_factor gain correction factor (2.13)
+ * @param fc_v fixed-codebook vector (2.13)
+ * @param mr_energy mean innovation energy and fixed-point correction (7.13)
+ * @param[in,out]  quant_energy  past quantized energies (5.10)
+ * @param subframe_size length of subframe
  *
- * \return quantized fixed-codebook gain (14.1)
+ * @return quantized fixed-codebook gain (14.1)
  *
  * The routine implements equations 69, 66 and 71 of the G.729 specification (3.9.1)
  *
@@ -205,7 +205,7 @@ void ff_acelp_update_past_gain(
  *
  *        mr_energy = Em + 10log(N) + 10log(2^26)
  *
- * \remark The routine is used in G.729 and AMR (all modes).
+ * @remark The routine is used in G.729 and AMR (all modes).
  */
 int16_t ff_acelp_decode_gain_code(
     DSPContext *dsp,
diff --git a/libavcodec/arm/aac.h b/libavcodec/arm/aac.h
index 3b14c094c6..bd4d293f02 100644
--- a/libavcodec/arm/aac.h
+++ b/libavcodec/arm/aac.h
@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
              "vmov     d1,  %2,  %3          \n\t"
              "lsls     %6,  %6,  #1          \n\t"
              "and      %0,  %5,  #1<<31      \n\t"
+             "it       cs                    \n\t"
              "lslcs    %5,  %5,  #1          \n\t"
              "lsls     %6,  %6,  #1          \n\t"
              "and      %1,  %5,  #1<<31      \n\t"
+             "it       cs                    \n\t"
              "lslcs    %5,  %5,  #1          \n\t"
              "lsls     %6,  %6,  #1          \n\t"
              "and      %2,  %5,  #1<<31      \n\t"
+             "it       cs                    \n\t"
              "lslcs    %5,  %5,  #1          \n\t"
              "vmov     d4,  %0,  %1          \n\t"
              "and      %3,  %5,  #1<<31      \n\t"
diff --git a/libavcodec/arm/ac3dsp_arm.S b/libavcodec/arm/ac3dsp_arm.S
index 545714cff1..9a7d20eb7b 100644
--- a/libavcodec/arm/ac3dsp_arm.S
+++ b/libavcodec/arm/ac3dsp_arm.S
@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
         lsl             r3,  lr,  #1
         ldrh            r12, [r0, r3]
         subs            r2,  r2,  #1
+        it              gt
         ldrbgt          lr,  [r1], #1
         add             r12, r12, #1
         strh            r12, [r0, r3]
diff --git a/libavcodec/arm/ac3dsp_armv6.S b/libavcodec/arm/ac3dsp_armv6.S
index 2b2f2acf22..615baf94e0 100644
--- a/libavcodec/arm/ac3dsp_armv6.S
+++ b/libavcodec/arm/ac3dsp_armv6.S
@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
         mov             r11, r10
         ldrb            r10, [r4], #1                   @ band_start_tab[band++]
         subs            r9,  r9,  r5                    @   - floor
+        it              lt
         movlt           r9,  #0
         cmp             r10, r3                         @   - end
         and             r9,  r9,  r8                    @   & 0x1fe0
+        ite             gt
         subgt           r8,  r3,  r11
         suble           r8,  r10, r11
         add             r9,  r9,  r5                    @   + floor => m
diff --git a/libavcodec/arm/ac3dsp_neon.S b/libavcodec/arm/ac3dsp_neon.S
index 946b39f25b..fdf1deabc9 100644
--- a/libavcodec/arm/ac3dsp_neon.S
+++ b/libavcodec/arm/ac3dsp_neon.S
@@ -41,6 +41,7 @@ endfunc
 
 function ff_ac3_exponent_min_neon, export=1
         cmp             r1,  #0
+        it              eq
         bxeq            lr
         push            {lr}
         mov             r12, #256
diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
index bb999fd61a..fc7ee60357 100644
--- a/libavcodec/arm/asm.S
+++ b/libavcodec/arm/asm.S
@@ -26,7 +26,16 @@
 #   define ELF @
 #endif
 
+#if CONFIG_THUMB
+#   define A @
+#   define T
+#else
+#   define A
+#   define T @
+#endif
+
         .syntax unified
+T       .thumb
 
 .macro  require8 val=1
 ELF     .eabi_attribute 24, \val
@@ -82,6 +91,90 @@ ELF     .size   \name, . - \name
 #endif
 .endm
 
+.macro  ldr_pre         rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       ldr             \rt, [\rn]
+.endm
+
+.macro  ldr_post        rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn], \rm
+T       ldr             \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  ldrd_reg        rt,  rt2, rn,  rm
+A       ldrd            \rt, \rt2, [\rn, \rm]
+T       add             \rt, \rn, \rm
+T       ldrd            \rt, \rt2, [\rt]
+.endm
+
+.macro  ldrd_post       rt,  rt2, rn,  rm
+A       ldrd            \rt, \rt2, [\rn], \rm
+T       ldrd            \rt, \rt2, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  ldrh_pre        rt,  rn,  rm
+A       ldrh            \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       ldrh            \rt, [\rn]
+.endm
+
+.macro  ldrh_dpre       rt,  rn,  rm
+A       ldrh            \rt, [\rn, -\rm]!
+T       sub             \rn, \rn, \rm
+T       ldrh            \rt, [\rn]
+.endm
+
+.macro  ldrh_post       rt,  rn,  rm
+A       ldrh            \rt, [\rn], \rm
+T       ldrh            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  str_post       rt,  rn,  rm:vararg
+A       str             \rt, [\rn], \rm
+T       str             \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  strb_post       rt,  rn,  rm:vararg
+A       strb            \rt, [\rn], \rm
+T       strb            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  strd_post       rt,  rt2, rn,  rm
+A       strd            \rt, \rt2, [\rn], \rm
+T       strd            \rt, \rt2, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  strh_pre        rt,  rn,  rm
+A       strh            \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       strh            \rt, [\rn]
+.endm
+
+.macro  strh_dpre       rt,  rn,  rm
+A       strh            \rt, [\rn, -\rm]!
+T       sub             \rn, \rn, \rm
+T       strh            \rt, [\rn]
+.endm
+
+.macro  strh_post       rt,  rn,  rm
+A       strh            \rt, [\rn], \rm
+T       strh            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+
+.macro  strh_dpost       rt,  rn,  rm
+A       strh            \rt, [\rn], -\rm
+T       strh            \rt, [\rn]
+T       sub             \rn, \rn, \rm
+.endm
+
 #if HAVE_VFP_ARGS
         .eabi_attribute 28, 1
 #   define VFP
diff --git a/libavcodec/arm/dcadsp_neon.S b/libavcodec/arm/dcadsp_neon.S
index c3bddd3e41..852527a59e 100644
--- a/libavcodec/arm/dcadsp_neon.S
+++ b/libavcodec/arm/dcadsp_neon.S
@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
         add             r5,  r2,  #256*4-16     @ cf1
         sub             r1,  r1,  #12
         cmp             r3,  #32
+        ite             eq
         moveq           r6,  #256/32
         movne           r6,  #256/64
 NOVFP   vldr            s0,  [sp, #16]          @ scale
diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S
index 7ee85e808b..c614206bac 100644
--- a/libavcodec/arm/dsputil_arm.S
+++ b/libavcodec/arm/dsputil_arm.S
@@ -554,10 +554,12 @@ endfunc
         and             r9,  r5,  r14
         and             r10, r6,  r14
         and             r11, r7,  r14
+        it              eq
         andeq           r14, r14, r14, \rnd #1
         add             r8,  r8,  r10
         add             r9,  r9,  r11
         ldr             r12, =0xfcfcfcfc >> 2
+        itt             eq
         addeq           r8,  r8,  r14
         addeq           r9,  r9,  r14
         and             r4,  r12, r4,  lsr #2
@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn             r5,  r5
         mvn             r7,  r7
         tst             r6,  #0x100
+        it              ne
         movne           r6,  r5,  lsr #24
         tst             r8,  #0x100
+        it              ne
         movne           r8,  r7,  lsr #24
         mov             r9,  r6
         ldrsh           r5,  [r0, #4]           /* moved form [A] */
@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn             r5,  r5
         mvn             r7,  r7
         tst             r6,  #0x100
+        it              ne
         movne           r6,  r5,  lsr #24
         tst             r8,  #0x100
+        it              ne
         movne           r8,  r7,  lsr #24
         orr             r9,  r9,  r6,  lsl #16
         ldr             r4,  [r1, #4]           /* moved form [B] */
@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn             r5,  r5
         mvn             r7,  r7
         tst             r6,  #0x100
+        it              ne
         movne           r6,  r5,  lsr #24
         tst             r8,  #0x100
+        it              ne
         movne           r8,  r7,  lsr #24
         mov             r9,  r6
         ldrsh           r5,  [r0, #12]          /* moved from [D] */
@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
         mvn             r5,  r5
         mvn             r7,  r7
         tst             r6,  #0x100
+        it              ne
         movne           r6,  r5,  lsr #24
         tst             r8,  #0x100
+        it              ne
         movne           r8,  r7,  lsr #24
         orr             r9,  r9,  r6,  lsl #16
         add             r0,  r0,  #16           /* moved from [E] */
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
index 214d947da3..a2c8588fad 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
         ldr             r5,  [r1, #4]
         ldr             r6,  [r1, #8]
         ldr             r7,  [r1, #12]
-        ldr             r4,  [r1], r2
+        ldr_post        r4,  r1,  r2
         strd            r6,  r7,  [r0, #8]
         ldr             r9,  [r1, #4]
-        strd            r4,  r5,  [r0],  r2
+        strd_post       r4,  r5,  r0,  r2
         ldr             r10, [r1, #8]
         ldr             r11, [r1, #12]
-        ldr             r8,  [r1], r2
+        ldr_post        r8,  r1,  r2
         strd            r10, r11, [r0, #8]
         subs            r3,  r3,  #2
-        strd            r8,  r9,  [r0],  r2
+        strd_post       r8,  r9,  r0,  r2
         bne             1b
 
         pop             {r4-r11}
@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
         push            {r4-r7}
 1:
         ldr             r5,  [r1, #4]
-        ldr             r4,  [r1], r2
+        ldr_post        r4,  r1,  r2
         ldr             r7,  [r1, #4]
-        strd            r4,  r5,  [r0],  r2
-        ldr             r6,  [r1], r2
+        strd_post       r4,  r5,  r0,  r2
+        ldr_post        r6,  r1,  r2
         subs            r3,  r3,  #2
-        strd            r6,  r7,  [r0],  r2
+        strd_post       r6,  r7,  r0,  r2
         bne             1b
 
         pop             {r4-r7}
@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
         ldr             r5,  [r1, #4]
         ldr             r7,  [r1, #5]
         lsr             r6,  r4,  #8
-        ldr             r8,  [r1, r2]!
+        ldr_pre         r8,  r1,  r2
         orr             r6,  r6,  r5,  lsl #24
         ldr             r9,  [r1, #4]
         ldr             r11, [r1, #5]
@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
         uhadd8          r9,  r9,  r11
         and             r6,  r6,  r12
         uadd8           r8,  r8,  r14
-        strd            r4,  r5,  [r0],  r2
+        strd_post       r4,  r5,  r0,  r2
         uadd8           r9,  r9,  r6
-        strd            r8,  r9,  [r0],  r2
+        strd_post       r8,  r9,  r0,  r2
         bne             1b
 
         pop             {r4-r11, pc}
@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
         orr             r12, r12, r12, lsl #16
         ldr             r4,  [r1]
         ldr             r5,  [r1, #4]
-        ldr             r6,  [r1, r2]!
+        ldr_pre         r6,  r1,  r2
         ldr             r7,  [r1, #4]
 1:
         subs            r3,  r3,  #2
@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
         uhadd8          r9,  r5,  r7
         eor             r11, r5,  r7
         and             r10, r10, r12
-        ldr             r4,  [r1, r2]!
+        ldr_pre         r4,  r1,  r2
         uadd8           r8,  r8,  r10
         and             r11, r11, r12
         uadd8           r9,  r9,  r11
@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
         eor             r7,  r5,  r7
         uadd8           r10, r10, r6
         and             r7,  r7,  r12
-        ldr             r6,  [r1, r2]!
+        ldr_pre         r6,  r1,  r2
         uadd8           r11, r11, r7
-        strd            r8,  r9,  [r0],  r2
+        strd_post       r8,  r9,  r0,  r2
         ldr             r7,  [r1, #4]
-        strd            r10, r11, [r0],  r2
+        strd_post       r10, r11, r0,  r2
         bne             1b
 
         pop             {r4-r11}
@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
         ldr             r4,  [r1]
         ldr             r5,  [r1, #4]
         ldr             r7,  [r1, #5]
-        ldr             r8,  [r1, r2]!
+        ldr_pre         r8,  r1,  r2
         ldr             r9,  [r1, #4]
         ldr             r14, [r1, #5]
         add             r1,  r1,  r2
@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
         push            {r4-r9, lr}
         ldr             r4,  [r1]
         ldr             r5,  [r1, #4]
-        ldr             r6,  [r1, r2]!
+        ldr_pre         r6,  r1,  r2
         ldr             r7,  [r1, #4]
 1:
         subs            r3,  r3,  #2
         uhadd8          r8,  r4,  r6
-        ldr             r4,  [r1, r2]!
+        ldr_pre         r4,  r1,  r2
         uhadd8          r9,  r5,  r7
         ldr             r5,  [r1, #4]
         uhadd8          r12, r4,  r6
-        ldr             r6,  [r1, r2]!
+        ldr_pre         r6,  r1,  r2
         uhadd8          r14, r5,  r7
         ldr             r7,  [r1, #4]
         stm             r0,  {r8,r9}
@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
         orr             lr,  lr,  lr,  lsl #16
         ldrd            r4,  r5,  [r0]
         ldr             r10, [r1, #4]
-        ldr             r9,  [r1], r2
+        ldr_post        r9,  r1,  r2
         subs            r3,  r3,  #2
 1:
         pld             [r1, r2]
         eor             r8,  r4,  r9
         uhadd8          r4,  r4,  r9
         eor             r12, r5,  r10
-        ldrd            r6,  r7,  [r0, r2]
+        ldrd_reg        r6,  r7,  r0,  r2
         uhadd8          r5,  r5,  r10
         and             r8,  r8,  lr
         ldr             r10, [r1, #4]
         and             r12, r12, lr
         uadd8           r4,  r4,  r8
-        ldr             r9,  [r1], r2
+        ldr_post        r9,  r1,  r2
         eor             r8,  r6,  r9
         uadd8           r5,  r5,  r12
         pld             [r1, r2,  lsl #1]
         eor             r12, r7,  r10
         uhadd8          r6,  r6,  r9
-        strd            r4,  r5,  [r0], r2
+        strd_post       r4,  r5,  r0,  r2
         uhadd8          r7,  r7,  r10
         beq             2f
         and             r8,  r8,  lr
-        ldrd            r4,  r5,  [r0, r2]
+        ldrd_reg        r4,  r5,  r0,  r2
         uadd8           r6,  r6,  r8
         ldr             r10, [r1, #4]
         and             r12, r12, lr
         subs            r3,  r3,  #2
         uadd8           r7,  r7,  r12
-        ldr             r9,  [r1], r2
-        strd            r6,  r7,  [r0], r2
+        ldr_post        r9,  r1,  r2
+        strd_post       r6,  r7,  r0,  r2
         b               1b
 2:
         and             r8,  r8,  lr
         and             r12, r12, lr
         uadd8           r6,  r6,  r8
         uadd8           r7,  r7,  r12
-        strd            r6,  r7,  [r0], r2
+        strd_post       r6,  r7,  r0,  r2
 
         pop             {r4-r10, pc}
 endfunc
@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
         orr             r6,  r8,  r5,  lsl #8
         orr             r7,  r4,  lr,  lsl #8
         subs            r3,  r3,  #1
-        strd            r6,  r7,  [r1],  r2
+        strd_post       r6,  r7,  r1,  r2
         bgt             1b
         pop             {r4-r8,pc}
 endfunc
@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
         push            {r4-r8, lr}
         mov             lr,  #8
 1:
-        ldrd            r4,  r5,  [r1],  r2
+        ldrd_post       r4,  r5,  r1,  r2
         subs            lr,  lr,  #1
         uxtb16          r6,  r4
         uxtb16          r4,  r4,  ror #8
@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
         push            {r4-r9, lr}
         mov             lr,  #8
 1:
-        ldrd            r4,  r5,  [r1],  r3
-        ldrd            r6,  r7,  [r2],  r3
+        ldrd_post       r4,  r5,  r1,  r3
+        ldrd_post       r6,  r7,  r2,  r3
         uxtb16          r8,  r4
         uxtb16          r4,  r4,  ror #8
         uxtb16          r9,  r6
@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
         push            {r4-r9, lr}
         mov             r0,  #0
         mov             lr,  #0
-        ldrd            r4,  r5,  [r1], r3
+        ldrd_post       r4,  r5,  r1,  r3
 1:
         subs            r12, r12, #2
         ldr             r7,  [r2, #4]
-        ldr             r6,  [r2], r3
-        ldrd            r8,  r9,  [r1], r3
+        ldr_post        r6,  r2,  r3
+        ldrd_post       r8,  r9,  r1,  r3
         usada8          r0,  r4,  r6,  r0
         pld             [r2, r3]
         usada8          lr,  r5,  r7,  lr
         ldr             r7,  [r2, #4]
-        ldr             r6,  [r2], r3
+        ldr_post        r6,  r2,  r3
         beq             2f
-        ldrd            r4,  r5,  [r1], r3
+        ldrd_post       r4,  r5,  r1,  r3
         usada8          r0,  r8,  r6,  r0
         pld             [r2, r3]
         usada8          lr,  r9,  r7,  lr
@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
         ldr             r7,  [r0, #12]
         usada8          r2,  r6,  lr,  r2
         beq             2f
-        ldr             r4,  [r0, r1]!
+        ldr_pre         r4,  r0,  r1
         usada8          r3,  r7,  lr,  r3
         bgt             1b
 2:
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 0dbf5ca48a..2147658af6 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
 
 2:      vst1.32         {d2-d3},  [r3, :128]!
         vst1.32         {d0-d1},  [r12,:128]!
+        it              lt
         bxlt            lr
 
 3:      vld1.32         {d2-d3},  [r1,:128]
@@ -575,6 +576,7 @@ NOVFP   vdup.32         q8,  r2
 2:      vst1.32         {q2},[r0,:128]!
         vst1.32         {q3},[r0,:128]!
         ands            len, len, #15
+        it              eq
         bxeq            lr
 3:      vld1.32         {q0},[r1,:128]!
         vmul.f32        q0,  q0,  q8
@@ -638,6 +640,7 @@ NOVFP   ldr             r3,  [sp]
 2:      vst1.32         {q8},[r0,:128]!
         vst1.32         {q9},[r0,:128]!
         ands            r3,  r3,  #7
+        it              eq
         popeq           {pc}
 3:      vld1.32         {q0},[r1,:128]!
         ldr             r12, [r2], #4
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index 497c02be92..108208174d 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
 1:
         subs            r3,  r3,  #16
         vmul.f32        s12, s4,  s12
+        itttt           ge
         vldmiage        r1!, {s16-s19}
         vldmiage        r2!, {s24-s27}
         vldmiage        r1!, {s20-s23}
         vldmiage        r2!, {s28-s31}
+        it              ge
         vmulge.f32      s24, s16, s24
         vstmia          r0!, {s8-s11}
         vstmia          r0!, {s12-s15}
+        it              ge
         vmulge.f32      s28, s20, s28
+        itttt           gt
         vldmiagt        r1!, {s0-s3}
         vldmiagt        r2!, {s8-s11}
         vldmiagt        r1!, {s4-s7}
         vldmiagt        r2!, {s12-s15}
+        ittt            ge
         vmulge.f32      s8,  s0,  s8
         vstmiage        r0!, {s24-s27}
         vstmiage        r0!, {s28-s31}
@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
         vmul.f32        s11, s0,  s11
 1:
         subs            r3,  r3,  #16
+        it              ge
         vldmdbge        r2!, {s16-s19}
         vmul.f32        s12, s7,  s12
+        it              ge
         vldmiage        r1!, {s24-s27}
         vmul.f32        s13, s6,  s13
+        it              ge
         vldmdbge        r2!, {s20-s23}
         vmul.f32        s14, s5,  s14
+        it              ge
         vldmiage        r1!, {s28-s31}
         vmul.f32        s15, s4,  s15
+        it              ge
         vmulge.f32      s24, s19, s24
+        it              gt
         vldmdbgt        r2!, {s0-s3}
+        it              ge
         vmulge.f32      s25, s18, s25
         vstmia          r0!, {s8-s13}
+        it              ge
         vmulge.f32      s26, s17, s26
+        it              gt
         vldmiagt        r1!, {s8-s11}
+        itt             ge
         vmulge.f32      s27, s16, s27
         vmulge.f32      s28, s23, s28
+        it              gt
         vldmdbgt        r2!, {s4-s7}
+        it              ge
         vmulge.f32      s29, s22, s29
         vstmia          r0!, {s14-s15}
+        ittt            ge
         vmulge.f32      s30, s21, s30
         vmulge.f32      s31, s20, s31
         vmulge.f32      s8,  s3,  s8
+        it              gt
         vldmiagt        r1!, {s12-s15}
+        itttt           ge
         vmulge.f32      s9,  s2,  s9
         vmulge.f32      s10, s1,  s10
         vstmiage        r0!, {s24-s27}
         vmulge.f32      s11, s0,  s11
+        it              ge
         vstmiage        r0!, {s28-s31}
         bgt             1b
 
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
index 359e57e40b..d1ad32ed27 100644
--- a/libavcodec/arm/fmtconvert_neon.S
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -71,6 +71,7 @@ endfunc
 
 function ff_float_to_int16_interleave_neon, export=1
         cmp             r3, #2
+        itt             lt
         ldrlt           r1, [r1]
         blt             ff_float_to_int16_neon
         bne             4f
@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.64         {d3},     [r8], ip
         vst1.64         {d7},     [r8], ip
         subs            r3,  r3,  #4
+        it              eq
         popeq           {r4-r8,pc}
         cmp             r3,  #4
         add             r0,  r0,  #8
@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.32         {d23[1]}, [r8], ip
 8:      subs            r3,  r3,  #2
         add             r0,  r0,  #4
+        it              eq
         popeq           {r4-r8,pc}
 
         @ 1 channel
@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
         vst1.16         {d2[3]},  [r5,:16], ip
         vst1.16         {d3[1]},  [r5,:16], ip
         vst1.16         {d3[3]},  [r5,:16], ip
+        it              eq
         popeq           {r4-r8,pc}
         vld1.64         {d0-d1},  [r4,:128]!
         vcvt.s32.f32    q0,  q0,  #16
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index da2ef8c158..7e2eb83620 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
         vmov            r5,  r6,  s2, s3
         vmov            r7,  r8,  s4, s5
         vmov            ip,  lr,  s6, s7
+        it              gt
         vldmiagt        r1!, {s16-s23}
         ssat            r4,  #16, r4
         ssat            r3,  #16, r3
@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
         ssat            r5,  #16, r5
         pkhbt           r3,  r3,  r4, lsl #16
         pkhbt           r4,  r5,  r6, lsl #16
+        itttt           gt
         vcvtgt.s32.f32  s0,  s16
         vcvtgt.s32.f32  s1,  s17
         vcvtgt.s32.f32  s2,  s18
         vcvtgt.s32.f32  s3,  s19
+        itttt           gt
         vcvtgt.s32.f32  s4,  s20
         vcvtgt.s32.f32  s5,  s21
         vcvtgt.s32.f32  s6,  s22
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index bd15ced736..338de6f643 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
-        muls            r7,  r4,  r5
+A       muls            r7,  r4,  r5
+T       mul             r7,  r4,  r5
+T       cmp             r7,  #0
         rsb             r6,  r7,  r5,  lsl #3
         rsb             ip,  r7,  r4,  lsl #3
         sub             r4,  r7,  r4,  lsl #3
@@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
-        muls            r7,  r4,  r5
+A       muls            r7,  r4,  r5
+T       mul             r7,  r4,  r5
+T       cmp             r7,  #0
         rsb             r6,  r7,  r5,  lsl #3
         rsb             ip,  r7,  r4,  lsl #3
         sub             r4,  r7,  r4,  lsl #3
@@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
         pop             {r4-r6, pc}
 2:
 .ifc \type,put
-        ldrh            r5,  [r1], r2
-        strh            r5,  [r0], r2
-        ldrh            r6,  [r1], r2
-        strh            r6,  [r0], r2
+        ldrh_post       r5,  r1,  r2
+        strh_post       r5,  r0,  r2
+        ldrh_post       r6,  r1,  r2
+        strh_post       r6,  r0,  r2
 .else
         vld1.16         {d16[0]}, [r1], r2
         vld1.16         {d16[1]}, [r1], r2
@@ -404,28 +408,17 @@ endfunc
         ldr             ip,  [sp]
         tst             r2,  r2
         ldr             ip,  [ip]
+        it              ne
         tstne           r3,  r3
         vmov.32         d24[0], ip
         and             ip,  ip,  ip, lsl #16
+        it              eq
         bxeq            lr
         ands            ip,  ip,  ip, lsl #8
+        it              lt
         bxlt            lr
         .endm
 
-        .macro align_push_regs
-        and             ip,  sp,  #15
-        add             ip,  ip,  #32
-        sub             sp,  sp,  ip
-        vst1.64         {d12-d15}, [sp,:128]
-        sub             sp,  sp,  #32
-        vst1.64         {d8-d11},  [sp,:128]
-        .endm
-
-        .macro align_pop_regs
-        vld1.64         {d8-d11},  [sp,:128]!
-        vld1.64         {d12-d15}, [sp,:128], ip
-        .endm
-
         .macro h264_loop_filter_luma
         vdup.8          q11, r2         @ alpha
         vmovl.u8        q12, d24
@@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
         vld1.64         {d18,d19}, [r0,:128], r1
         vld1.64         {d16,d17}, [r0,:128], r1
 
-        align_push_regs
+        vpush           {d8-d15}
 
         h264_loop_filter_luma
 
@@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
         vst1.64         {d0, d1},  [r0,:128], r1
         vst1.64         {d10,d11}, [r0,:128]
 
-        align_pop_regs
+        vpop            {d8-d15}
         bx              lr
 endfunc
 
@@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
 
         transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
 
-        align_push_regs
+        vpush           {d8-d15}
 
         h264_loop_filter_luma
 
@@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         vst1.32         {d1[1]},  [r0], r1
         vst1.32         {d11[1]}, [r0], r1
 
-        align_pop_regs
+        vpop            {d8-d15}
         bx              lr
 endfunc
 
@@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon
         vrhadd.u8       d11, d11, d7
         sub             r0,  r0,  r2,  lsl #3
 .endif
+
         vst1.64         {d12},     [r0,:64], r2
         vst1.64         {d13},     [r0,:64], r2
         vst1.64         {d14},     [r0,:64], r2
@@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
 \type\()_h264_qpel8_mc11:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #64
         mov             r0,  sp
         sub             r1,  r1,  #2
@@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
         mov             ip,  #8
         vpush           {d8-d15}
         bl              put_h264_qpel8_h_lowpass_neon
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         mov             r3,  r2
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #8
         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11, #8
+        mov             sp,  r11
         pop             {r11, pc}
 endfunc
 
@@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
 \type\()_h264_qpel8_mc21:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #(8*8+16*12)
         sub             r1,  r1,  #2
         mov             r3,  #8
@@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
         vpush           {d8-d15}
         bl              put_h264_qpel8_h_lowpass_neon
         mov             r4,  r0
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
         sub             r2,  r4,  #64
         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11,  #8
+        mov             sp,  r11
         pop             {r4, r10, r11, pc}
 endfunc
 
@@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
 \type\()_h264_qpel8_mc12:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #(8*8+16*12)
         sub             r1,  r1,  r2, lsl #1
         mov             r3,  r2
@@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
         vpush           {d8-d15}
         bl              put_h264_qpel8_v_lowpass_neon
         mov             r4,  r0
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         sub             r2,  r4,  #64
         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11,  #8
+        mov             sp,  r11
         pop             {r4, r10, r11, pc}
 endfunc
 
 function ff_\type\()_h264_qpel8_mc22_neon, export=1
         push            {r4, r10, r11, lr}
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r4,  r11, #15
+T       mov             sp,  r4
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
@@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1
 \type\()_h264_qpel16_mc11:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #256
         mov             r0,  sp
         sub             r1,  r1,  #2
         mov             r3,  #16
         vpush           {d8-d15}
         bl              put_h264_qpel16_h_lowpass_neon
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         mov             r3,  r2
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #16
         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11, #8
+        mov             sp,  r11
         pop             {r4, r11, pc}
 endfunc
 
@@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1
 \type\()_h264_qpel16_mc21:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #(16*16+16*12)
         sub             r1,  r1,  #2
         mov             r0,  sp
         vpush           {d8-d15}
         bl              put_h264_qpel16_h_lowpass_neon_packed
         mov             r4,  r0
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11,  #8
+        mov             sp,  r11
         pop             {r4-r5, r9-r11, pc}
 endfunc
 
@@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
 \type\()_h264_qpel16_mc12:
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r0,  r11, #15
+T       mov             sp,  r0
         sub             sp,  sp,  #(16*16+16*12)
         sub             r1,  r1,  r2, lsl #1
         mov             r0,  sp
@@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
         vpush           {d8-d15}
         bl              put_h264_qpel16_v_lowpass_neon_packed
         mov             r4,  r0
-        ldrd            r0,  [r11]
+        ldrd            r0,  [r11], #8
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         mov             r2,  r3
         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  r11,  #8
+        mov             sp,  r11
         pop             {r4-r5, r9-r11, pc}
 endfunc
 
@@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1
         push            {r4, r9-r11, lr}
         lowpass_const   r3
         mov             r11, sp
-        bic             sp,  sp,  #15
+A       bic             sp,  sp,  #15
+T       bic             r4,  r11, #15
+T       mov             sp,  r4
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index afd3718518..6ea56587b8 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
         blt             2f
         ldrsh           lr,  [r1]
         add             r0,  r0,  r4
+        it              ne
         movne           lr,  #0
         cmp             lr,  #0
-        adrne           lr,  ff_h264_idct_dc_add_neon
-        adreq           lr,  ff_h264_idct_add_neon
+        ite             ne
+        adrne           lr,  ff_h264_idct_dc_add_neon + CONFIG_THUMB
+        adreq           lr,  ff_h264_idct_add_neon    + CONFIG_THUMB
         blx             lr
 2:      subs            ip,  ip,  #1
         add             r1,  r1,  #32
@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
         add             r0,  r0,  r4
         cmp             r8,  #0
         ldrsh           r8,  [r1]
-        adrne           lr,  ff_h264_idct_add_neon
-        adreq           lr,  ff_h264_idct_dc_add_neon
+        iteet           ne
+        adrne           lr,  ff_h264_idct_add_neon    + CONFIG_THUMB
+        adreq           lr,  ff_h264_idct_dc_add_neon + CONFIG_THUMB
         cmpeq           r8,  #0
         blxne           lr
         subs            ip,  ip,  #1
@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
         add             r1,  r3,  r12, lsl #5
         cmp             r8,  #0
         ldrsh           r8,  [r1]
-        adrne           lr,  ff_h264_idct_add_neon
-        adreq           lr,  ff_h264_idct_dc_add_neon
+        iteet           ne
+        adrne           lr,  ff_h264_idct_add_neon    + CONFIG_THUMB
+        adreq           lr,  ff_h264_idct_dc_add_neon + CONFIG_THUMB
         cmpeq           r8,  #0
         blxne           lr
         add             r12, r12, #1
         cmp             r12, #4
+        itt             eq
         moveq           r12, #16
         moveq           r4,  r9
         cmp             r12, #20
@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
         blt             2f
         ldrsh           lr,  [r1]
         add             r0,  r0,  r4
+        it              ne
         movne           lr,  #0
         cmp             lr,  #0
-        adrne           lr,  ff_h264_idct8_dc_add_neon
-        adreq           lr,  ff_h264_idct8_add_neon
+        ite             ne
+        adrne           lr,  ff_h264_idct8_dc_add_neon + CONFIG_THUMB
+        adreq           lr,  ff_h264_idct8_add_neon    + CONFIG_THUMB
         blx             lr
 2:      subs            r12, r12, #4
         add             r1,  r1,  #128
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
index 299a973cb6..d67714c496 100644
--- a/libavcodec/arm/mathops.h
+++ b/libavcodec/arm/mathops.h
@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
     __asm__ (
         "mov   %0, %2  \n\t"
         "cmp   %1, %2  \n\t"
+        "itt   gt      \n\t"
         "movgt %0, %1  \n\t"
         "movgt %1, %2  \n\t"
         "cmp   %1, %3  \n\t"
+        "it    le      \n\t"
         "movle %1, %3  \n\t"
         "cmp   %0, %1  \n\t"
+        "it    gt      \n\t"
         "movgt %0, %1  \n\t"
         : "=&r"(m), "+r"(a)
         : "r"(b), "r"(c)
diff --git a/libavcodec/arm/mdct_neon.S b/libavcodec/arm/mdct_neon.S
index fcf802275f..2def704497 100644
--- a/libavcodec/arm/mdct_neon.S
+++ b/libavcodec/arm/mdct_neon.S
@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
         vadd.f32        d17, d17, d3            @ in2u+in1d     -I
 1:
         vmul.f32        d7,  d0,  d21           @  I*s
-        ldr             r10, [r3, lr, lsr #1]
+A       ldr             r10, [r3, lr, lsr #1]
+T       lsr             r10, lr,  #1
+T       ldr             r10, [r3, r10]
         vmul.f32        d6,  d1,  d20           @ -R*c
         ldr             r6,  [r3, #4]!
         vmul.f32        d4,  d1,  d21           @ -R*s
diff --git a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
index 9ec731480b..b517b973e7 100644
--- a/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ b/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
         sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr
         sum8            r8,  r9,  r1,  r2,  r10, r11, r12, lr, rsb, 32
         round           r10, r8,  r9
-        strh            r10, [r3], r4
+        strh_post       r10, r3,  r4
 
         mov             lr,  #15
 1:
@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
         round           r10, r8,  r9
         adds            r8,  r8,  r4
         adc             r9,  r9,  r7
-        strh            r10, [r3], r12
+        strh_post       r10, r3,  r12
         round           r11, r8,  r9
         subs            lr,  lr,  #1
-        strh            r11, [r5], -r12
+        strh_dpost      r11, r5, r12
         bgt             1b
 
         sum8            r8,  r9,  r1,  r0,  r10, r11, r12, lr, rsb, 33
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
index 82095ab15d..3db9c734e9 100644
--- a/libavcodec/arm/mpegvideo_armv5te_s.S
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -38,15 +38,21 @@
 
 .macro  dequant_t       dst, src, mul, add, tmp
         rsbs            \tmp, ip, \src, asr #16
+        it              gt
         addgt           \tmp, \add, #0
+        it              lt
         rsblt           \tmp, \add, #0
+        it              ne
         smlatbne        \dst, \src, \mul, \tmp
 .endm
 
 .macro  dequant_b       dst, src, mul, add, tmp
         rsbs            \tmp, ip, \src, lsl #16
+        it              gt
         addgt           \tmp, \add, #0
+        it              lt
         rsblt           \tmp, \add, #0
+        it              ne
         smlabbne        \dst, \src, \mul, \tmp
 .endm
 
@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
         strh            lr, [r0], #2
 
         subs            r3, r3, #8
+        it              gt
         ldrdgt          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
         bgt             1b
 
         adds            r3, r3, #2
+        it              le
         pople           {r4-r9,pc}
 2:
         ldrsh           r9, [r0, #0]
         ldrsh           lr, [r0, #2]
         mov             r8, r2
         cmp             r9, #0
+        it              lt
         rsblt           r8, r2, #0
+        it              ne
         smlabbne        r9, r9, r1, r8
         mov             r8, r2
         cmp             lr, #0
+        it              lt
         rsblt           r8, r2, #0
+        it              ne
         smlabbne        lr, lr, r1, r8
         strh            r9, [r0], #2
         strh            lr, [r0], #2
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index b695fb7c22..849047e13c 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
         subs            r3,  r3,  #16
         vst1.16         {q0},     [r1,:128]!
         vst1.16         {q8},     [r1,:128]!
+        it              le
         bxle            lr
         cmp             r3,  #8
         bgt             1b
@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         ldr             r6,  [r0, #AC_PRED]
         add             lr,  r0,  #INTER_SCANTAB_RASTER_END
         cmp             r6,  #0
+        it              ne
         movne           r12, #63
         bne             1f
         ldr             r12, [r12, r2, lsl #2]
@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         ldrsh           r4,  [r1]
         cmp             r5,  #0
         mov             r5,  r1
+        it              ne
         movne           r2,  #0
         bne             2f
         cmp             r2,  #4
+        it              ge
         addge           r0,  r0,  #4
         sub             r2,  r3,  #1
         ldr             r6,  [r0, #Y_DC_SCALE]
diff --git a/libavcodec/arm/rdft_neon.S b/libavcodec/arm/rdft_neon.S
index 4f8a1032cc..19886e6d0b 100644
--- a/libavcodec/arm/rdft_neon.S
+++ b/libavcodec/arm/rdft_neon.S
@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
         vst1.32         {d22},    [r5,:64]
 
         cmp             r6,  #0
+        it              eq
         popeq           {r4-r8,pc}
 
         vmul.f32        d22, d22, d18
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
index ecb83d23ad..990dde6ff7 100644
--- a/libavcodec/arm/simple_idct_arm.S
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -121,11 +121,13 @@ __b_evaluation:
         ldr r11, [r12, #offW7]   @ R11=W7
         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
-                teq r2, #0               @ if null avoid muls
-                mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        teq r2, #0               @ if null avoid muls
+        itttt ne
+        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         rsbne r2, r2, #0         @ R2=-ROWr16[3]
         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it    ne
         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 
         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ -148,19 +150,23 @@ __b_evaluation:
         @@ MAC16(b3, -W1, row[7]);
         @@ MAC16(b1, -W5, row[7]);
         mov r3, r3, asr #16      @ R3=ROWr16[5]
-                teq r3, #0               @ if null avoid muls
+        teq r3, #0               @ if null avoid muls
+        it    ne
         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
         mov r4, r4, asr #16      @ R4=ROWr16[7]
+        itttt ne
         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
         rsbne r3, r3, #0         @ R3=-ROWr16[5]
         mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5]=b1
         @@ R3 is free now
-                teq r4, #0               @ if null avoid muls
+        teq r4, #0               @ if null avoid muls
+        itttt ne
         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
         rsbne r4, r4, #0         @ R4=-ROWr16[7]
         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
+        it    ne
         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
         @@ R4 is free now
 __end_b_evaluation:
@@ -204,16 +210,19 @@ __a_evaluation:
         @@ a2 -= W4*row[4]
         @@ a3 += W4*row[4]
         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
-                teq r11, #0              @ if null avoid muls
+        teq r11, #0              @ if null avoid muls
+        it    ne
         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
         @@ R9 is free now
         ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
+        itttt ne
         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
         @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
-                teq r9, #0               @ if null avoid muls
+        teq r9, #0               @ if null avoid muls
+        itttt ne
         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
@@ -222,6 +231,7 @@ __a_evaluation:
         @@ a1 -= W2*row[6];
         @@ a2 += W2*row[6];
         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
+        itt   ne
         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 
@@ -323,10 +333,12 @@ __b_evaluation2:
         ldrsh r2, [r14, #48]
         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
         teq r2, #0               @ if 0, then avoid muls
+        itttt ne
         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         rsbne r2, r2, #0         @ R2=-ROWr16[3]
         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+        it    ne
         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 
         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
@@ -342,18 +354,22 @@ __b_evaluation2:
         @@ MAC16(b1, -W5, col[7x8]);
         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
         teq r3, #0               @ if 0 then avoid muls
+        itttt ne
         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
         rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
+        it    ne
         mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5x8]=b1
         @@ R3 is free now
         teq r4, #0               @ if 0 then avoid muls
+        itttt ne
         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
         rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
+        it    ne
         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
         @@ R4 is free now
 __end_b_evaluation2:
@@ -390,15 +406,18 @@ __a_evaluation2:
         @@ a3 += W4*row[4]
         ldrsh r11, [r14, #64]    @ R11=ROWr16[4]
         teq r11, #0              @ if null avoid muls
+        itttt ne
         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
         @@ R9 is free now
         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
         ldrsh r9, [r14, #96]     @ R9=ROWr16[6]
+        it    ne
         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
         @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
         teq r9, #0               @ if null avoid muls
+        itttt ne
         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
@@ -407,6 +426,7 @@ __a_evaluation2:
         @@ a1 -= W2*row[6];
         @@ a2 += W2*row[6];
         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
+        itt   ne
         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 __end_a_evaluation2:
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
index 3c4b5c06d1..71727ceccc 100644
--- a/libavcodec/arm/simple_idct_armv5te.S
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -49,6 +49,7 @@ function idct_row_armv5te
         ldrd   v1, [a1, #8]
         ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
         orrs   v1, v1, v2
+        itt    eq
         cmpeq  v1, a4
         cmpeq  v1, a3, lsr #16
         beq    row_dc_only
@@ -269,6 +270,7 @@ function idct_col_armv5te
         ldmfd  sp!, {a3, a4}
         adds   a2, a3, v1
         mov    a2, a2, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         add    ip, a4, v2
         mov    ip, ip, asr #20
@@ -276,6 +278,7 @@ function idct_col_armv5te
         str    a2, [a1]
         subs   a3, a3, v1
         mov    a2, a3, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         sub    a4, a4, v2
         mov    a4, a4, asr #20
@@ -285,6 +288,7 @@ function idct_col_armv5te
 
         subs   a2, a3, v3
         mov    a2, a2, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         sub    ip, a4, v4
         mov    ip, ip, asr #20
@@ -292,6 +296,7 @@ function idct_col_armv5te
         str    a2, [a1, #(16*1)]
         adds   a3, a3, v3
         mov    a2, a3, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         add    a4, a4, v4
         mov    a4, a4, asr #20
@@ -301,6 +306,7 @@ function idct_col_armv5te
 
         adds   a2, a3, v5
         mov    a2, a2, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         add    ip, a4, v6
         mov    ip, ip, asr #20
@@ -308,6 +314,7 @@ function idct_col_armv5te
         str    a2, [a1, #(16*2)]
         subs   a3, a3, v5
         mov    a2, a3, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         sub    a4, a4, v6
         mov    a4, a4, asr #20
@@ -317,6 +324,7 @@ function idct_col_armv5te
 
         adds   a2, a3, v7
         mov    a2, a2, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         add    ip, a4, fp
         mov    ip, ip, asr #20
@@ -324,6 +332,7 @@ function idct_col_armv5te
         str    a2, [a1, #(16*3)]
         subs   a3, a3, v7
         mov    a2, a3, lsr #20
+        it     mi
         orrmi  a2, a2, #0xf000
         sub    a4, a4, fp
         mov    a4, a4, asr #20
@@ -335,15 +344,19 @@ endfunc
 
 .macro  clip   dst, src:vararg
         movs   \dst, \src
+        it     mi
         movmi  \dst, #0
         cmp    \dst, #255
+        it     gt
         movgt  \dst, #255
 .endm
 
 .macro  aclip  dst, src:vararg
         adds   \dst, \src
+        it     mi
         movmi  \dst, #0
         cmp    \dst, #255
+        it     gt
         movgt  \dst, #255
 .endm
 
@@ -370,35 +383,35 @@ function idct_col_put_armv5te
         orr    a2, a3, a4, lsl #8
         rsb    v2, lr, lr, lsl #3
         ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, v1]!
+        strh_pre a2, v2, v1
 
         sub    a2, a3, v3
         clip   a2, a2, asr #20
         sub    ip, a4, v4
         clip   ip, ip, asr #20
         orr    a2, a2, ip, lsl #8
-        strh   a2, [v1, lr]!
+        strh_pre a2, v1, lr
         add    a3, a3, v3
         clip   a2, a3, asr #20
         add    a4, a4, v4
         clip   a4, a4, asr #20
         orr    a2, a2, a4, lsl #8
         ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, -lr]!
+        strh_dpre a2, v2, lr
 
         add    a2, a3, v5
         clip   a2, a2, asr #20
         add    ip, a4, v6
         clip   ip, ip, asr #20
         orr    a2, a2, ip, lsl #8
-        strh   a2, [v1, lr]!
+        strh_pre a2, v1, lr
         sub    a3, a3, v5
         clip   a2, a3, asr #20
         sub    a4, a4, v6
         clip   a4, a4, asr #20
         orr    a2, a2, a4, lsl #8
         ldmfd  sp!, {a3, a4}
-        strh   a2, [v2, -lr]!
+        strh_dpre a2, v2, lr
 
         add    a2, a3, v7
         clip   a2, a2, asr #20
@@ -411,7 +424,7 @@ function idct_col_put_armv5te
         sub    a4, a4, fp
         clip   a4, a4, asr #20
         orr    a2, a2, a4, lsl #8
-        strh   a2, [v2, -lr]
+        strh_dpre a2, v2, lr
 
         ldr    pc, [sp], #4
 endfunc
@@ -436,7 +449,7 @@ function idct_col_add_armv5te
         ldr    v1, [sp, #32]
         sub    a4, a4, v2
         rsb    v2, v1, v1, lsl #3
-        ldrh   ip, [v2, lr]!
+        ldrh_pre ip, v2, lr
         strh   a2, [lr]
         and    a2, ip, #255
         aclip  a3, a2, a3, asr #20
@@ -448,7 +461,7 @@ function idct_col_add_armv5te
         strh   a2, [v2]
 
         ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         sub    a2, a3, v3
         add    a3, a3, v3
         and    v3, ip, #255
@@ -458,7 +471,7 @@ function idct_col_add_armv5te
         aclip  v3, v3, ip, lsr #8
         orr    a2, a2, v3, lsl #8
         add    a4, a4, v4
-        ldrh   ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh   a2, [lr]
         and    a2, ip, #255
         aclip  a3, a2, a3, asr #20
@@ -468,7 +481,7 @@ function idct_col_add_armv5te
         strh   a2, [v2]
 
         ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         add    a2, a3, v5
         sub    a3, a3, v5
         and    v3, ip, #255
@@ -478,7 +491,7 @@ function idct_col_add_armv5te
         aclip  v3, v3, ip, lsr #8
         orr    a2, a2, v3, lsl #8
         sub    a4, a4, v6
-        ldrh   ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh   a2, [lr]
         and    a2, ip, #255
         aclip  a3, a2, a3, asr #20
@@ -488,7 +501,7 @@ function idct_col_add_armv5te
         strh   a2, [v2]
 
         ldmfd  sp!, {a3, a4}
-        ldrh   ip, [lr, v1]!
+        ldrh_pre ip, lr, v1
         add    a2, a3, v7
         sub    a3, a3, v7
         and    v3, ip, #255
@@ -498,7 +511,7 @@ function idct_col_add_armv5te
         aclip  v3, v3, ip, lsr #8
         orr    a2, a2, v3, lsl #8
         sub    a4, a4, fp
-        ldrh   ip, [v2, -v1]!
+        ldrh_dpre ip, v2, v1
         strh   a2, [lr]
         and    a2, ip, #255
         aclip  a3, a2, a3, asr #20
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
index d61c1fd3ea..a176b3a7b4 100644
--- a/libavcodec/arm/simple_idct_armv6.S
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -200,6 +200,7 @@ function idct_row_armv6
         ldr    r3, [r0, #8]          /* r3 = row[3,1] */
         ldr    r2, [r0]              /* r2 = row[2,0] */
         orrs   lr, lr, ip
+        itt    eq
         cmpeq  lr, r3
         cmpeq  lr, r2, lsr #16
         beq    1f
@@ -282,14 +283,14 @@ function idct_col_put_armv6
         pop    {r1, r2}
         idct_finish_shift_sat COL_SHIFT
 
-        strb   r4, [r1], r2
-        strb   r5, [r1], r2
-        strb   r6, [r1], r2
-        strb   r7, [r1], r2
-        strb   r11,[r1], r2
-        strb   r10,[r1], r2
-        strb   r9, [r1], r2
-        strb   r8, [r1], r2
+        strb_post r4, r1, r2
+        strb_post r5, r1, r2
+        strb_post r6, r1, r2
+        strb_post r7, r1, r2
+        strb_post r11,r1, r2
+        strb_post r10,r1, r2
+        strb_post r9, r1, r2
+        strb_post r8, r1, r2
 
         sub    r1, r1, r2, lsl #3
 
@@ -318,16 +319,16 @@ function idct_col_add_armv6
         add    ip, r3, ip, asr #COL_SHIFT
         usat   ip, #8, ip
         add    r4, r7, r4, asr #COL_SHIFT
-        strb   ip, [r1], r2
+        strb_post ip, r1, r2
         ldrb   ip, [r1, r2]
         usat   r4, #8, r4
         ldrb   r11,[r1, r2, lsl #2]
         add    r5, ip, r5, asr #COL_SHIFT
         usat   r5, #8, r5
-        strb   r4, [r1], r2
+        strb_post r4, r1, r2
         ldrb   r3, [r1, r2]
         ldrb   ip, [r1, r2, lsl #2]
-        strb   r5, [r1], r2
+        strb_post r5, r1, r2
         ldrb   r7, [r1, r2]
         ldrb   r4, [r1, r2, lsl #2]
         add    r6, r3, r6, asr #COL_SHIFT
@@ -340,11 +341,11 @@ function idct_col_add_armv6
         usat   r8, #8, r8
         add    lr, r4, lr, asr #COL_SHIFT
         usat   lr, #8, lr
-        strb   r6, [r1], r2
-        strb   r10,[r1], r2
-        strb   r9, [r1], r2
-        strb   r8, [r1], r2
-        strb   lr, [r1], r2
+        strb_post r6, r1, r2
+        strb_post r10,r1, r2
+        strb_post r9, r1, r2
+        strb_post r8, r1, r2
+        strb_post lr, r1, r2
 
         sub    r1, r1, r2, lsl #3
 
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
index 17cde5835a..64a7fbf13a 100644
--- a/libavcodec/arm/simple_idct_neon.S
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -71,7 +71,7 @@ function idct_row4_pld_neon
         add             r3,  r0,  r1,  lsl #2
         pld             [r0, r1]
         pld             [r0, r1, lsl #1]
-        pld             [r3, -r1]
+A       pld             [r3, -r1]
         pld             [r3]
         pld             [r3, r1]
         add             r3,  r3,  r1,  lsl #1
@@ -164,6 +164,7 @@ function idct_col4_neon
         orrs            r4,  r4,  r5
 
         idct_col4_top
+        it              eq
         addeq           r2,  r2,  #16
         beq             1f
 
@@ -176,6 +177,7 @@ function idct_col4_neon
 
 1:      orrs            r6,  r6,  r7
         ldrd            r4,  [r2, #16]
+        it              eq
         addeq           r2,  r2,  #16
         beq             2f
 
@@ -187,6 +189,7 @@ function idct_col4_neon
 
 2:      orrs            r4,  r4,  r5
         ldrd            r4,  [r2, #16]
+        it              eq
         addeq           r2,  r2,  #16
         beq             3f
 
@@ -199,6 +202,7 @@ function idct_col4_neon
         vadd.i32        q13, q13, q8
 
 3:      orrs            r4,  r4,  r5
+        it              eq
         addeq           r2,  r2,  #16
         beq             4f
 
diff --git a/libavcodec/arm/synth_filter_neon.S b/libavcodec/arm/synth_filter_neon.S
index 1464abe562..3f91d67506 100644
--- a/libavcodec/arm/synth_filter_neon.S
+++ b/libavcodec/arm/synth_filter_neon.S
@@ -100,9 +100,11 @@ NOVFP   vldr            s0,  [sp, #12*4]        @ scale
         vst1.32         {q9},     [r2,:128]
 
         subs            r1,  r1,  #1
+        it              eq
         popeq           {r4-r11,pc}
 
         cmp             r4,  #0
+        itt             eq
         subeq           r8,  r8,  #512*4
         subeq           r9,  r9,  #512*4
         sub             r5,  r5,  #512*4
diff --git a/libavcodec/arm/vp56_arith.h b/libavcodec/arm/vp56_arith.h
index cd02579e5b..ece9ac2a6c 100644
--- a/libavcodec/arm/vp56_arith.h
+++ b/libavcodec/arm/vp56_arith.h
@@ -21,6 +21,14 @@
 #ifndef AVCODEC_ARM_VP56_ARITH_H
 #define AVCODEC_ARM_VP56_ARITH_H
 
+#if CONFIG_THUMB
+#   define A(x)
+#   define T(x) x
+#else
+#   define A(x) x
+#   define T(x)
+#endif
+
 #if HAVE_ARMV6 && HAVE_INLINE_ASM
 
 #define vp56_rac_get_prob vp56_rac_get_prob_armv6
@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
     unsigned bit;
 
     __asm__ ("adds    %3,  %3,  %0           \n"
+             "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-             "ldrcsh  %2,  [%4], #2          \n"
+           A("ldrcsh  %2,  [%4], #2          \n")
+           T("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
+           T("itttt   cs                     \n")
              "rev16cs %2,  %2                \n"
-             "orrcs   %1,  %1,  %2,  lsl %3  \n"
+           T("lslcs   %2,  %2,  %3           \n")
+           T("orrcs   %1,  %1,  %2           \n")
+           A("orrcs   %1,  %1,  %2,  lsl %3  \n")
              "subcs   %3,  %3,  #16          \n"
              "lsr     %0,  %0,  #8           \n"
              "cmp     %1,  %0,  lsl #16      \n"
+             "ittte   ge                     \n"
              "subge   %1,  %1,  %0,  lsl #16 \n"
              "subge   %0,  %5,  %0           \n"
              "movge   %2,  #1                \n"
@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
     unsigned tmp;
 
     __asm__ ("adds    %3,  %3,  %0           \n"
+             "itt     cs                     \n"
              "cmpcs   %7,  %4                \n"
-             "ldrcsh  %2,  [%4], #2          \n"
+           A("ldrcsh  %2,  [%4], #2          \n")
+           T("ldrhcs  %2,  [%4], #2          \n")
              "rsb     %0,  %6,  #256         \n"
              "smlabb  %0,  %5,  %6,  %0      \n"
+           T("itttt   cs                     \n")
              "rev16cs %2,  %2                \n"
-             "orrcs   %1,  %1,  %2,  lsl %3  \n"
+           T("lslcs   %2,  %2,  %3           \n")
+           T("orrcs   %1,  %1,  %2           \n")
+           A("orrcs   %1,  %1,  %2,  lsl %3  \n")
              "subcs   %3,  %3,  #16          \n"
              "lsr     %0,  %0,  #8           \n"
              "lsl     %2,  %0,  #16          \n"
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
index 93f4dd664b..b995360e0c 100644
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@@ -25,13 +25,18 @@
         lsl             \cw, \cw, \t0
         lsl             \t0, \h,  \t0
         rsb             \h,  \pr, #256
+        it              cs
         ldrhcs          \t1, [\buf], #2
         smlabb          \h,  \t0, \pr, \h
+T       itttt           cs
         rev16cs         \t1, \t1
-        orrcs           \cw, \cw, \t1, lsl \bs
+A       orrcs           \cw, \cw, \t1, lsl \bs
+T       lslcs           \t1, \t1, \bs
+T       orrcs           \cw, \cw, \t1
         subcs           \bs, \bs, #16
         lsr             \h,  \h,  #8
         cmp             \cw, \h,  lsl #16
+        itt             ge
         subge           \cw, \cw, \h,  lsl #16
         subge           \h,  \t0, \h
 .endm
@@ -40,14 +45,20 @@
         adds            \bs, \bs, \t0
         lsl             \cw, \cw, \t0
         lsl             \t0, \h,  \t0
+        it              cs
         ldrhcs          \t1, [\buf], #2
         mov             \h,  #128
+        it              cs
         rev16cs         \t1, \t1
         add             \h,  \h,  \t0, lsl #7
-        orrcs           \cw, \cw, \t1, lsl \bs
+A       orrcs           \cw, \cw, \t1, lsl \bs
+T       ittt            cs
+T       lslcs           \t1, \t1, \bs
+T       orrcs           \cw, \cw, \t1
         subcs           \bs, \bs, #16
         lsr             \h,  \h,  #8
         cmp             \cw, \h,  lsl #16
+        itt             ge
         subge           \cw, \cw, \h,  lsl #16
         subge           \h,  \t0, \h
 .endm
@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
         cmp             r3,  #0
         ldr             r11, [r5]
         ldm             r0,  {r5-r7}                    @ high, bits, buf
+        it              ne
         pkhtbne         r11, r11, r11, asr #16
         ldr             r8,  [r0, #16]                  @ code_word
 0:
@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
         adds            r6,  r6,  r9
         add             r4,  r4,  #11
         lsl             r8,  r8,  r9
+        it              cs
         ldrhcs          r10, [r7], #2
         lsl             r9,  r5,  r9
         mov             r5,  #128
+        it              cs
         rev16cs         r10, r10
         add             r5,  r5,  r9,  lsl #7
-        orrcs           r8,  r8,  r10, lsl r6
+T       ittt            cs
+T       lslcs           r10, r10, r6
+T       orrcs           r8,  r8,  r10
+A       orrcs           r8,  r8,  r10, lsl r6
         subcs           r6,  r6,  #16
         lsr             r5,  r5,  #8
         cmp             r8,  r5,  lsl #16
         movrel          r10, zigzag_scan-1
+        itt             ge
         subge           r8,  r8,  r5,  lsl #16
         subge           r5,  r9,  r5
         ldrb            r10, [r10, r3]
+        it              ge
         rsbge           r12, r12, #0
         cmp             r3,  #16
         strh            r12, [r1, r10]
@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
         ldr             r0,  [sp]
         ldr             r9,  [r0, #12]
         cmp             r7,  r9
+        it              hi
         movhi           r7,  r9
         stm             r0,  {r5-r7}                    @ high, bits, buf
         str             r8,  [r0, #16]                  @ code_word
@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
         mov             r12, #2
         ldrb            r0,  [r4, #4]
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r12, #1
         ldrb            r9,  [lr, r5]
         blt             4f
         ldrb            r0,  [r4, #5]
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r12, #1
         ldrb            r9,  [lr, r5]
         b               4f
@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
         mov             r12, #5
         mov             r0,  #159
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r12, r12, #1
         ldrb            r9,  [lr, r5]
         b               4f
@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
         mov             r12, #7
         mov             r0,  #165
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r12, r12, #2
         ldrb            r9,  [lr, r5]
         mov             r0,  #145
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r12, r12, #1
         ldrb            r9,  [lr, r5]
         b               4f
 3:
         ldrb            r0,  [r4, #8]
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
+        it              ge
         addge           r4,  r4,  #1
         ldrb            r9,  [lr, r5]
+        ite             ge
         movge           r12, #2
         movlt           r12, #0
         ldrb            r0,  [r4, #9]
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
         mov             r9,  #8
+        it              ge
         addge           r12, r12, #1
         movrel          r4,  X(ff_vp8_dct_cat_prob)
         lsl             r9,  r9,  r12
@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
         lsl             r1,  r1,  #1
         rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
         ldrb            r0,  [r4], #1
+        it              ge
         addge           r1,  r1,  #1
         cmp             r0,  #0
         bne             1b
@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
         add             r4,  r2,  r4
         add             r4,  r4,  #22
         rac_get_128     r5,  r6,  r7,  r8,  r9,  r10
+        it              ge
         rsbge           r12, r12, #0
         smulbb          r12, r12, r11
         movrel          r9,  zigzag_scan-1
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index 23330900f7..28487e7a60 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
         push            {r4-r6,lr}
 1:
         subs            r12, r12, #4
-        ldr             r4,       [r2], r3
-        ldr             r5,       [r2], r3
-        ldr             r6,       [r2], r3
-        ldr             lr,       [r2], r3
-        str             r4,       [r0], r1
-        str             r5,       [r0], r1
-        str             r6,       [r0], r1
-        str             lr,       [r0], r1
+        ldr_post        r4,  r2,  r3
+        ldr_post        r5,  r2,  r3
+        ldr_post        r6,  r2,  r3
+        ldr_post        lr,  r2,  r3
+        str_post        r4,  r0,  r1
+        str_post        r5,  r0,  r1
+        str_post        r6,  r0,  r1
+        str_post        lr,  r0,  r1
         bgt             1b
         pop             {r4-r6,pc}
 endfunc
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 90c389b8c5..42eabdd623 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -3320,7 +3320,7 @@ void av_resample_close(struct AVResampleContext *c);
 /**
  * Allocate memory for a picture.  Call avpicture_free() to free it.
  *
- * \see avpicture_fill()
+ * @see avpicture_fill()
  *
  * @param picture the picture to be filled in
  * @param pix_fmt the format of the picture
@@ -3367,7 +3367,7 @@ int avpicture_fill(AVPicture *picture, uint8_t *ptr,
  * The data is stored compactly, without any gaps for alignment or padding
  * which may be applied by avpicture_fill().
  *
- * \see avpicture_get_size()
+ * @see avpicture_get_size()
  *
  * @param[in] src AVPicture containing image data
  * @param[in] pix_fmt The format in which the picture data is stored.
@@ -3964,7 +3964,7 @@ typedef struct AVCodecParserContext {
     int64_t offset;      ///< byte offset from starting packet start
     int64_t cur_frame_end[AV_PARSER_PTS_NB];
 
-    /*!
+    /**
      * Set by parser to 1 for key frames and 0 for non-key frames.
      * It is initialized to -1, so if the parser doesn't set this flag,
      * old-style fallback using AV_PICTURE_TYPE_I picture type as key frames
@@ -4211,7 +4211,7 @@ void av_log_missing_feature(void *avc, const char *feature, int want_sample);
  * a pointer to an AVClass struct
  * @param[in] msg string containing an optional message, or NULL if no message
  */
-void av_log_ask_for_sample(void *avc, const char *msg, ...);
+void av_log_ask_for_sample(void *avc, const char *msg, ...) av_printf_format(2, 3);
 
 /**
  * Register the hardware accelerator hwaccel.
diff --git a/libavcodec/celp_filters.h b/libavcodec/celp_filters.h
index 145e3d3346..2fb2b03aaa 100644
--- a/libavcodec/celp_filters.h
+++ b/libavcodec/celp_filters.h
@@ -34,7 +34,7 @@
  *
  *  fc_out[n] = sum(i,0,len-1){ fc_in[i] * filter[(len + n - i)%len] }
  *
- * \note fc_in and fc_out should not overlap!
+ * @note fc_in and fc_out should not overlap!
  */
 void ff_celp_convolve_circ(int16_t *fc_out, const int16_t *fc_in,
                            const int16_t *filter, int len);
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index 24db7e3d24..0e19e947b1 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -119,7 +119,7 @@ extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[17];
 
 /**
  * Initialize the cosine table in ff_cos_tabs[index]
- * \param index index in ff_cos_tabs array of the table to initialize
+ * @param index index in ff_cos_tabs array of the table to initialize
  */
 void ff_init_ff_cos_tabs(int index);
 
diff --git a/libavcodec/g729dec.c b/libavcodec/g729dec.c
index 32db0597e3..c4a883f392 100644
--- a/libavcodec/g729dec.c
+++ b/libavcodec/g729dec.c
@@ -116,7 +116,7 @@ static const G729FormatDescription format_g729d_6k4 = {
 };
 
 /**
- * \brief pseudo random number generator
+ * @brief pseudo random number generator
  */
 static inline uint16_t g729_prng(uint16_t value)
 {
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index 080b6a93b5..27fba4b628 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -97,7 +97,7 @@ found:
     return i-(state&5);
 }
 
-/*!
+/**
  * Parse NAL units of found picture and decode some basic information.
  *
  * @param s parser context.
diff --git a/libavcodec/lagarith.c b/libavcodec/lagarith.c
index 02d3533b0c..5cff77f58c 100644
--- a/libavcodec/lagarith.c
+++ b/libavcodec/lagarith.c
@@ -32,25 +32,25 @@
 #include "lagarithrac.h"
 
 enum LagarithFrameType {
-    FRAME_RAW           = 1,    /*!< uncompressed */
-    FRAME_U_RGB24       = 2,    /*!< unaligned RGB24 */
-    FRAME_ARITH_YUY2    = 3,    /*!< arithmetic coded YUY2 */
-    FRAME_ARITH_RGB24   = 4,    /*!< arithmetic coded RGB24 */
-    FRAME_SOLID_GRAY    = 5,    /*!< solid grayscale color frame */
-    FRAME_SOLID_COLOR   = 6,    /*!< solid non-grayscale color frame */
-    FRAME_OLD_ARITH_RGB = 7,    /*!< obsolete arithmetic coded RGB (no longer encoded by upstream since version 1.1.0) */
-    FRAME_ARITH_RGBA    = 8,    /*!< arithmetic coded RGBA */
-    FRAME_SOLID_RGBA    = 9,    /*!< solid RGBA color frame */
-    FRAME_ARITH_YV12    = 10,   /*!< arithmetic coded YV12 */
-    FRAME_REDUCED_RES   = 11,   /*!< reduced resolution YV12 frame */
+    FRAME_RAW           = 1,    /**< uncompressed */
+    FRAME_U_RGB24       = 2,    /**< unaligned RGB24 */
+    FRAME_ARITH_YUY2    = 3,    /**< arithmetic coded YUY2 */
+    FRAME_ARITH_RGB24   = 4,    /**< arithmetic coded RGB24 */
+    FRAME_SOLID_GRAY    = 5,    /**< solid grayscale color frame */
+    FRAME_SOLID_COLOR   = 6,    /**< solid non-grayscale color frame */
+    FRAME_OLD_ARITH_RGB = 7,    /**< obsolete arithmetic coded RGB (no longer encoded by upstream since version 1.1.0) */
+    FRAME_ARITH_RGBA    = 8,    /**< arithmetic coded RGBA */
+    FRAME_SOLID_RGBA    = 9,    /**< solid RGBA color frame */
+    FRAME_ARITH_YV12    = 10,   /**< arithmetic coded YV12 */
+    FRAME_REDUCED_RES   = 11,   /**< reduced resolution YV12 frame */
 };
 
 typedef struct LagarithContext {
     AVCodecContext *avctx;
     AVFrame picture;
     DSPContext dsp;
-    int zeros;                  /*!< number of consecutive zero bytes encountered */
-    int zeros_rem;              /*!< number of zero bytes remaining to output */
+    int zeros;                  /**< number of consecutive zero bytes encountered */
+    int zeros_rem;              /**< number of zero bytes remaining to output */
 } LagarithContext;
 
 /**
diff --git a/libavcodec/lagarithrac.h b/libavcodec/lagarithrac.h
index 2cb7323076..8c78538f21 100644
--- a/libavcodec/lagarithrac.h
+++ b/libavcodec/lagarithrac.h
@@ -40,15 +40,15 @@ typedef struct lag_rac {
     AVCodecContext *avctx;
     unsigned low;
     unsigned range;
-    unsigned scale;             /*!< Number of bits of precision in range. */
-    unsigned hash_shift;        /*!< Number of bits to shift to calculate hash for radix search. */
+    unsigned scale;             /**< Number of bits of precision in range. */
+    unsigned hash_shift;        /**< Number of bits to shift to calculate hash for radix search. */
 
-    const uint8_t *bytestream_start;  /*!< Start of input bytestream. */
-    const uint8_t *bytestream;        /*!< Current position in input bytestream. */
-    const uint8_t *bytestream_end;    /*!< End position of input bytestream. */
+    const uint8_t *bytestream_start;  /**< Start of input bytestream. */
+    const uint8_t *bytestream;        /**< Current position in input bytestream. */
+    const uint8_t *bytestream_end;    /**< End position of input bytestream. */
 
-    uint32_t prob[258];         /*!< Table of cumulative probability for each symbol. */
-    uint8_t  range_hash[256];   /*!< Hash table mapping upper byte to approximate symbol. */
+    uint32_t prob[258];         /**< Table of cumulative probability for each symbol. */
+    uint8_t  range_hash[256];   /**< Hash table mapping upper byte to approximate symbol. */
 } lag_rac;
 
 void lag_rac_init(lag_rac *l, GetBitContext *gb, int length);
diff --git a/libavcodec/lcldec.c b/libavcodec/lcldec.c
index 57735ac6ff..7359864004 100644
--- a/libavcodec/lcldec.c
+++ b/libavcodec/lcldec.c
@@ -73,8 +73,8 @@ typedef struct LclDecContext {
 
 
 /**
- * \param srcptr compressed source buffer, must be padded with at least 5 extra bytes
- * \param destptr must be padded sufficiently for av_memcpy_backptr
+ * @param srcptr compressed source buffer, must be padded with at least 5 extra bytes
+ * @param destptr must be padded sufficiently for av_memcpy_backptr
  */
 static unsigned int mszh_decomp(const unsigned char * srcptr, int srclen, unsigned char * destptr, unsigned int destsize)
 {
@@ -119,11 +119,11 @@ static unsigned int mszh_decomp(const unsigned char * srcptr, int srclen, unsign
 
 #if CONFIG_ZLIB_DECODER
 /**
- * \brief decompress a zlib-compressed data block into decomp_buf
- * \param src compressed input buffer
- * \param src_len data length in input buffer
- * \param offset offset in decomp_buf
- * \param expected expected decompressed length
+ * @brief decompress a zlib-compressed data block into decomp_buf
+ * @param src compressed input buffer
+ * @param src_len data length in input buffer
+ * @param offset offset in decomp_buf
+ * @param expected expected decompressed length
  */
 static int zlib_decomp(AVCodecContext *avctx, const uint8_t *src, int src_len, int offset, int expected)
 {
diff --git a/libavcodec/lsp.c b/libavcodec/lsp.c
index 98ca490a76..0ff0f0986a 100644
--- a/libavcodec/lsp.c
+++ b/libavcodec/lsp.c
@@ -74,9 +74,9 @@ void ff_acelp_lsf2lspd(double *lsp, const float *lsf, int lp_order)
 }
 
 /**
- * \brief decodes polynomial coefficients from LSP
- * \param f [out] decoded polynomial coefficients (-0x20000000 <= (3.22) <= 0x1fffffff)
- * \param lsp LSP coefficients (-0x8000 <= (0.15) <= 0x7fff)
+ * @brief decodes polynomial coefficients from LSP
+ * @param f [out] decoded polynomial coefficients (-0x20000000 <= (3.22) <= 0x1fffffff)
+ * @param lsp LSP coefficients (-0x8000 <= (0.15) <= 0x7fff)
  */
 static void lsp2poly(int* f, const int16_t* lsp, int lp_half_order)
 {
diff --git a/libavcodec/lsp.h b/libavcodec/lsp.h
index e3af30d300..1230669b1a 100644
--- a/libavcodec/lsp.h
+++ b/libavcodec/lsp.h
@@ -30,12 +30,12 @@
 */
 
 /**
- * \brief ensure a minimum distance between LSFs
- * \param[in,out] lsfq LSF to check and adjust
- * \param lsfq_min_distance minimum distance between LSFs
- * \param lsfq_min minimum allowed LSF value
- * \param lsfq_max maximum allowed LSF value
- * \param lp_order LP filter order
+ * @brief ensure a minimum distance between LSFs
+ * @param[in,out] lsfq LSF to check and adjust
+ * @param lsfq_min_distance minimum distance between LSFs
+ * @param lsfq_min minimum allowed LSF value
+ * @param lsfq_max maximum allowed LSF value
+ * @param lp_order LP filter order
  */
 void ff_acelp_reorder_lsf(int16_t* lsfq, int lsfq_min_distance, int lsfq_min, int lsfq_max, int lp_order);
 
@@ -53,12 +53,12 @@ void ff_acelp_reorder_lsf(int16_t* lsfq, int lsfq_min_distance, int lsfq_min, in
 void ff_set_min_dist_lsf(float *lsf, double min_spacing, int size);
 
 /**
- * \brief Convert LSF to LSP
- * \param[out] lsp LSP coefficients (-0x8000 <= (0.15) < 0x8000)
- * \param lsf normalized LSF coefficients (0 <= (2.13) < 0x2000 * PI)
- * \param lp_order LP filter order
+ * @brief Convert LSF to LSP
+ * @param[out] lsp LSP coefficients (-0x8000 <= (0.15) < 0x8000)
+ * @param lsf normalized LSF coefficients (0 <= (2.13) < 0x2000 * PI)
+ * @param lp_order LP filter order
  *
- * \remark It is safe to pass the same array into the lsf and lsp parameters.
+ * @remark It is safe to pass the same array into the lsf and lsp parameters.
  */
 void ff_acelp_lsf2lsp(int16_t *lsp, const int16_t *lsf, int lp_order);
 
@@ -68,10 +68,10 @@ void ff_acelp_lsf2lsp(int16_t *lsp, const int16_t *lsf, int lp_order);
 void ff_acelp_lsf2lspd(double *lsp, const float *lsf, int lp_order);
 
 /**
- * \brief LSP to LP conversion (3.2.6 of G.729)
- * \param[out] lp decoded LP coefficients (-0x8000 <= (3.12) < 0x8000)
- * \param lsp LSP coefficients (-0x8000 <= (0.15) < 0x8000)
- * \param lp_half_order LP filter order, divided by 2
+ * @brief LSP to LP conversion (3.2.6 of G.729)
+ * @param[out] lp decoded LP coefficients (-0x8000 <= (3.12) < 0x8000)
+ * @param lsp LSP coefficients (-0x8000 <= (0.15) < 0x8000)
+ * @param lp_half_order LP filter order, divided by 2
  */
 void ff_acelp_lsp2lpc(int16_t* lp, const int16_t* lsp, int lp_half_order);
 
@@ -81,12 +81,12 @@ void ff_acelp_lsp2lpc(int16_t* lp, const int16_t* lsp, int lp_half_order);
 void ff_amrwb_lsp2lpc(const double *lsp, float *lp, int lp_order);
 
 /**
- * \brief Interpolate LSP for the first subframe and convert LSP -> LP for both subframes (3.2.5 and 3.2.6 of G.729)
- * \param[out] lp_1st decoded LP coefficients for first subframe  (-0x8000 <= (3.12) < 0x8000)
- * \param[out] lp_2nd decoded LP coefficients for second subframe (-0x8000 <= (3.12) < 0x8000)
- * \param lsp_2nd LSP coefficients of the second subframe (-0x8000 <= (0.15) < 0x8000)
- * \param lsp_prev LSP coefficients from the second subframe of the previous frame (-0x8000 <= (0.15) < 0x8000)
- * \param lp_order LP filter order
+ * @brief Interpolate LSP for the first subframe and convert LSP -> LP for both subframes (3.2.5 and 3.2.6 of G.729)
+ * @param[out] lp_1st decoded LP coefficients for first subframe  (-0x8000 <= (3.12) < 0x8000)
+ * @param[out] lp_2nd decoded LP coefficients for second subframe (-0x8000 <= (3.12) < 0x8000)
+ * @param lsp_2nd LSP coefficients of the second subframe (-0x8000 <= (0.15) < 0x8000)
+ * @param lsp_prev LSP coefficients from the second subframe of the previous frame (-0x8000 <= (0.15) < 0x8000)
+ * @param lp_order LP filter order
  */
 void ff_acelp_lp_decode(int16_t* lp_1st, int16_t* lp_2nd, const int16_t* lsp_2nd, const int16_t* lsp_prev, int lp_order);
 
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index c12ebf4c7c..46068bfe3d 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -248,7 +248,7 @@ static int cmp_internal(MpegEncContext *s, const int x, const int y, const int s
     }
 }
 
-/*! \brief compares a block (either a full macroblock or a partition thereof)
+/** @brief compares a block (either a full macroblock or a partition thereof)
     against a proposed motion-compensated prediction of that block
  */
 static av_always_inline int cmp(MpegEncContext *s, const int x, const int y, const int subx, const int suby,
diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c
index 461e85932b..5d319c5da2 100644
--- a/libavcodec/motion_est_template.c
+++ b/libavcodec/motion_est_template.c
@@ -992,8 +992,8 @@ static av_always_inline int diamond_search(MpegEncContext * s, int *best, int dm
         return   var_diamond_search(s, best, dmin, src_index, ref_index, penalty_factor, size, h, flags);
 }
 
-/*!
-   \param P[10][2] a list of candidate mvs to check before starting the
+/**
+   @param P[10][2] a list of candidate mvs to check before starting the
    iterative search. If one of the candidates is close to the optimal mv, then
    it takes fewer iterations. And it increases the chance that we find the
    optimal mv.
@@ -1003,12 +1003,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int
                              int ref_mv_scale, int flags, int size, int h)
 {
     MotionEstContext * const c= &s->me;
-    int best[2]={0, 0};      /*!< x and y coordinates of the best motion vector.
+    int best[2]={0, 0};      /**< x and y coordinates of the best motion vector.
                                i.e. the difference between the position of the
                                block currently being encoded and the position of
                                the block chosen to predict it from. */
     int d;                   ///< the score (cmp + penalty) of any given mv
-    int dmin;                /*!< the best value of d, i.e. the score
+    int dmin;                /**< the best value of d, i.e. the score
                                corresponding to the mv stored in best[]. */
     int map_generation;
     int penalty_factor;
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index a0ff354a08..2a54329a49 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -154,7 +154,7 @@ typedef struct MotionEstContext{
     uint32_t *score_map;               ///< map to store the scores
     int map_generation;
     int pre_penalty_factor;
-    int penalty_factor;                /*!< an estimate of the bits required to
+    int penalty_factor;                /**< an estimate of the bits required to
                                         code a given mv value, e.g. (1,0) takes
                                         more bits than (0,0). We have to
                                         estimate whether any reduction in
diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c
index 6eb6de3101..f31be59d8d 100644
--- a/libavcodec/nuv.c
+++ b/libavcodec/nuv.c
@@ -63,11 +63,11 @@ static const uint8_t fallback_cquant[] = {
 };
 
 /**
- * \brief copy frame data from buffer to AVFrame, handling stride.
- * \param f destination AVFrame
- * \param src source buffer, does not use any line-stride
- * \param width width of the video frame
- * \param height height of the video frame
+ * @brief copy frame data from buffer to AVFrame, handling stride.
+ * @param f destination AVFrame
+ * @param src source buffer, does not use any line-stride
+ * @param width width of the video frame
+ * @param height height of the video frame
  */
 static void copy_frame(AVFrame *f, const uint8_t *src,
                        int width, int height) {
@@ -77,7 +77,7 @@ static void copy_frame(AVFrame *f, const uint8_t *src,
 }
 
 /**
- * \brief extract quantization tables from codec data into our context
+ * @brief extract quantization tables from codec data into our context
  */
 static int get_quant(AVCodecContext *avctx, NuvContext *c,
                      const uint8_t *buf, int size) {
@@ -94,7 +94,7 @@ static int get_quant(AVCodecContext *avctx, NuvContext *c,
 }
 
 /**
- * \brief set quantization tables from a quality value
+ * @brief set quantization tables from a quality value
  */
 static void get_quant_quality(NuvContext *c, int quality) {
     int i;
diff --git a/libavcodec/qcelpdata.h b/libavcodec/qcelpdata.h
index d79cea9f6c..82cc61d7ea 100644
--- a/libavcodec/qcelpdata.h
+++ b/libavcodec/qcelpdata.h
@@ -74,9 +74,9 @@ typedef struct {
 static const float qcelp_hammsinc_table[4] = { -0.006822,  0.041249, -0.143459,  0.588863};
 
 typedef struct {
-    uint8_t index;  /*!< index into the QCELPContext structure */
-    uint8_t bitpos; /*!< position of the lowest bit in the value's byte */
-    uint8_t bitlen; /*!< number of bits to read */
+    uint8_t index;  /**< index into the QCELPContext structure */
+    uint8_t bitpos; /**< position of the lowest bit in the value's byte */
+    uint8_t bitlen; /**< number of bits to read */
 } QCELPBitmap;
 
 #define QCELP_OF(variable, bit, len) {offsetof(QCELPFrame, variable), bit, len}
diff --git a/libavcodec/qcelpdec.c b/libavcodec/qcelpdec.c
index 3ed821c81e..d565003a9e 100644
--- a/libavcodec/qcelpdec.c
+++ b/libavcodec/qcelpdec.c
@@ -46,7 +46,7 @@
 
 typedef enum
 {
-    I_F_Q = -1,    /*!< insufficient frame quality */
+    I_F_Q = -1,    /**< insufficient frame quality */
     SILENCE,
     RATE_OCTAVE,
     RATE_QUARTER,
@@ -58,12 +58,12 @@ typedef struct
 {
     GetBitContext     gb;
     qcelp_packet_rate bitrate;
-    QCELPFrame        frame;    /*!< unpacked data frame */
+    QCELPFrame        frame;    /**< unpacked data frame */
 
     uint8_t  erasure_count;
-    uint8_t  octave_count;      /*!< count the consecutive RATE_OCTAVE frames */
+    uint8_t  octave_count;      /**< count the consecutive RATE_OCTAVE frames */
     float    prev_lspf[10];
-    float    predictor_lspf[10];/*!< LSP predictor for RATE_OCTAVE and I_F_Q */
+    float    predictor_lspf[10];/**< LSP predictor for RATE_OCTAVE and I_F_Q */
     float    pitch_synthesis_filter_mem[303];
     float    pitch_pre_filter_mem[303];
     float    rnd_fir_filter_mem[180];
diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c
index 4c48f25b2c..303183f230 100644
--- a/libavcodec/rtjpeg.c
+++ b/libavcodec/rtjpeg.c
@@ -33,12 +33,12 @@
     if (n) {skip_bits(gb, n);}
 
 /**
- * \brief read one block from stream
- * \param gb contains stream data
- * \param block where data is written to
- * \param scan array containing the mapping stream address -> block position
- * \param quant quantization factors
- * \return 0 means the block is not coded, < 0 means an error occurred.
+ * @brief read one block from stream
+ * @param gb contains stream data
+ * @param block where data is written to
+ * @param scan array containing the mapping stream address -> block position
+ * @param quant quantization factors
+ * @return 0 means the block is not coded, < 0 means an error occurred.
  *
  * Note: GetBitContext is used to make the code simpler, since all data is
  * aligned this could be done faster in a different way, e.g. as it is done
@@ -96,13 +96,13 @@ static inline int get_block(GetBitContext *gb, DCTELEM *block, const uint8_t *sc
 }
 
 /**
- * \brief decode one rtjpeg YUV420 frame
- * \param c context, must be initialized via rtjpeg_decode_init
- * \param f AVFrame to place decoded frame into. If parts of the frame
+ * @brief decode one rtjpeg YUV420 frame
+ * @param c context, must be initialized via rtjpeg_decode_init
+ * @param f AVFrame to place decoded frame into. If parts of the frame
  *          are not coded they are left unchanged, so consider initializing it
- * \param buf buffer containing input data
- * \param buf_size length of input data in bytes
- * \return number of bytes consumed from the input buffer
+ * @param buf buffer containing input data
+ * @param buf_size length of input data in bytes
+ * @return number of bytes consumed from the input buffer
  */
 int rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f,
                                const uint8_t *buf, int buf_size) {
@@ -143,15 +143,15 @@ int rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f,
 }
 
 /**
- * \brief initialize an RTJpegContext, may be called multiple times
- * \param c context to initialize
- * \param dsp specifies the idct to use for decoding
- * \param width width of image, will be rounded down to the nearest multiple
+ * @brief initialize an RTJpegContext, may be called multiple times
+ * @param c context to initialize
+ * @param dsp specifies the idct to use for decoding
+ * @param width width of image, will be rounded down to the nearest multiple
  *              of 16 for decoding
- * \param height height of image, will be rounded down to the nearest multiple
+ * @param height height of image, will be rounded down to the nearest multiple
  *              of 16 for decoding
- * \param lquant luma quantization table to use
- * \param cquant chroma quantization table to use
+ * @param lquant luma quantization table to use
+ * @param cquant chroma quantization table to use
  */
 void rtjpeg_decode_init(RTJpegContext *c, DSPContext *dsp,
                         int width, int height,
diff --git a/libavcodec/tableprint.h b/libavcodec/tableprint.h
index d81b9a387b..d3e4dd956f 100644
--- a/libavcodec/tableprint.h
+++ b/libavcodec/tableprint.h
@@ -56,7 +56,7 @@ void write_##type##_2d_array(const void *arg, int len, int len2)\
 }
 
 /**
- * \defgroup printfuncs Predefined functions for printing tables
+ * @defgroup printfuncs Predefined functions for printing tables
  *
  * \{
  */
diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
index f8e75bb933..e7aceebd5b 100644
--- a/libavcodec/twinvq.c
+++ b/libavcodec/twinvq.c
@@ -411,7 +411,7 @@ static inline float mulawinv(float y, float clip, float mu)
  * a*b == 200 and the nearest integer is ill-defined, use a table to emulate
  * the following broken float-based implementation used by the binary decoder:
  *
- * \code
+ * @code
  * static int very_broken_op(int a, int b)
  * {
  *    static float test; // Ugh, force gcc to do the division first...
@@ -419,7 +419,7 @@ static inline float mulawinv(float y, float clip, float mu)
  *    test = a/400.;
  *    return b * test +  0.5;
  * }
- * \endcode
+ * @endcode
  *
  * @note if this function is replaced by just ROUNDED_DIV(a*b,400.), the stddev
  * between the original file (before encoding with Yamaha encoder) and the
@@ -938,14 +938,14 @@ static void permutate_in_line(int16_t *tab, int num_vect, int num_blocks,
 /**
  * Interpret the input data as in the following table:
  *
- * \verbatim
+ * @verbatim
  *
  * abcdefgh
  * ijklmnop
  * qrstuvw
  * x123456
  *
- * \endverbatim
+ * @endverbatim
  *
  * and transpose it, giving the output
  * aiqxbjr1cks2dlt3emu4fvn5gow6hp
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
index de028a0a7e..774fde840f 100644
--- a/libavcodec/vaapi.c
+++ b/libavcodec/vaapi.c
@@ -24,7 +24,7 @@
 #include "vaapi_internal.h"
 
 /**
- * \addtogroup VAAPI_Decoding
+ * @addtogroup VAAPI_Decoding
  *
  * @{
  */
diff --git a/libavcodec/vaapi.h b/libavcodec/vaapi.h
index 07568a47fc..4c3bb9bb52 100644
--- a/libavcodec/vaapi.h
+++ b/libavcodec/vaapi.h
@@ -27,8 +27,8 @@
 #include <stdint.h>
 
 /**
- * \defgroup VAAPI_Decoding VA API Decoding
- * \ingroup Decoder
+ * @defgroup VAAPI_Decoding VA API Decoding
+ * @ingroup Decoder
  * @{
  */
 
diff --git a/libavcodec/vaapi_internal.h b/libavcodec/vaapi_internal.h
index 2c0fdf945e..43fa889d15 100644
--- a/libavcodec/vaapi_internal.h
+++ b/libavcodec/vaapi_internal.h
@@ -30,7 +30,7 @@
 #include "mpegvideo.h"
 
 /**
- * \addtogroup VAAPI_Decoding
+ * @addtogroup VAAPI_Decoding
  *
  * @{
  */
diff --git a/libavcodec/vdpau.c b/libavcodec/vdpau.c
index 19bd96bc15..9fbcbf9a3f 100644
--- a/libavcodec/vdpau.c
+++ b/libavcodec/vdpau.c
@@ -33,7 +33,7 @@
 #include "vdpau_internal.h"
 
 /**
- * \addtogroup VDPAU_Decoding
+ * @addtogroup VDPAU_Decoding
  *
  * @{
  */
diff --git a/libavcodec/vdpau.h b/libavcodec/vdpau.h
index 0dc6fb850b..f3a547184d 100644
--- a/libavcodec/vdpau.h
+++ b/libavcodec/vdpau.h
@@ -25,7 +25,7 @@
 #define AVCODEC_VDPAU_H
 
 /**
- * \defgroup Decoder VDPAU Decoder and Renderer
+ * @defgroup Decoder VDPAU Decoder and Renderer
  *
  * VDPAU hardware acceleration has two modules
  * - VDPAU decoding
@@ -38,25 +38,25 @@
  * and rendering (API calls) are done as part of the VDPAU
  * presentation (vo_vdpau.c) module.
  *
- * \defgroup  VDPAU_Decoding VDPAU Decoding
- * \ingroup Decoder
+ * @defgroup  VDPAU_Decoding VDPAU Decoding
+ * @ingroup Decoder
  * @{
  */
 
 #include <vdpau/vdpau.h>
 #include <vdpau/vdpau_x11.h>
 
-/** \brief The videoSurface is used for rendering. */
+/** @brief The videoSurface is used for rendering. */
 #define FF_VDPAU_STATE_USED_FOR_RENDER 1
 
 /**
- * \brief The videoSurface is needed for reference/prediction.
+ * @brief The videoSurface is needed for reference/prediction.
  * The codec manipulates this.
  */
 #define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
 
 /**
- * \brief This structure is used as a callback between the FFmpeg
+ * @brief This structure is used as a callback between the FFmpeg
  * decoder (vd_) and presentation (vo_) module.
  * This is used for defining a video frame containing surface,
  * picture parameter, bitstream information etc which are passed
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index 5185d61e54..fc75a57519 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -43,7 +43,7 @@
 #include "idct_xvid.h"
 #include "dsputil_mmx.h"
 
-/*!
+/**
  * @file
  * @brief SSE2 idct compatible with xvidmmx
  */
diff --git a/libavcodec/x86/idct_xvid.h b/libavcodec/x86/idct_xvid.h
index 5fdc20d3ea..be91d1c68a 100644
--- a/libavcodec/x86/idct_xvid.h
+++ b/libavcodec/x86/idct_xvid.h
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/*!
+/**
  * @file
  * header for Xvid IDCT functions
  */
diff --git a/libavcodec/xsubenc.c b/libavcodec/xsubenc.c
index a7e3a891d4..0e950d1856 100644
--- a/libavcodec/xsubenc.c
+++ b/libavcodec/xsubenc.c
@@ -36,8 +36,8 @@
 
 /**
  * Encode a single color run. At most 16 bits will be used.
- * \param len   length of the run, values > 255 mean "until end of line", may not be < 0.
- * \param color color to encode, only the lowest two bits are used and all others must be 0.
+ * @param len   length of the run, values > 255 mean "until end of line", may not be < 0.
+ * @param color color to encode, only the lowest two bits are used and all others must be 0.
  */
 static void put_xsub_rle(PutBitContext *pb, int len, int color)
 {