author     Måns Rullgård <mans@mansr.com>    2008-12-17 00:54:54 +0000
committer  Måns Rullgård <mans@mansr.com>    2008-12-17 00:54:54 +0000
commit     a2fc0f6a6ddf884ace3c96a0d4f09f0932e6db32 (patch)
tree       44b807b924e29465a6aed924fcb53c719a83a956 /libavcodec/arm
parent     2600f8c86dcaa411a0485b1518e5e1592374aaf6 (diff)
ARM: replace "armv4l" with "arm"
Originally committed as revision 16179 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/arm')
-rw-r--r--  libavcodec/arm/asm.S                            36
-rw-r--r--  libavcodec/arm/dsputil_arm.c                   217
-rw-r--r--  libavcodec/arm/dsputil_arm_s.S                 799
-rw-r--r--  libavcodec/arm/dsputil_iwmmxt.c                205
-rw-r--r--  libavcodec/arm/dsputil_iwmmxt_rnd_template.c  1114
-rw-r--r--  libavcodec/arm/dsputil_neon.c                  169
-rw-r--r--  libavcodec/arm/dsputil_neon_s.S                274
-rw-r--r--  libavcodec/arm/dsputil_vfp.S                   189
-rw-r--r--  libavcodec/arm/float_arm_vfp.c                  35
-rw-r--r--  libavcodec/arm/h264dsp_neon.S                 1377
-rw-r--r--  libavcodec/arm/h264idct_neon.S                  96
-rw-r--r--  libavcodec/arm/jrevdct_arm.S                   388
-rw-r--r--  libavcodec/arm/mathops.h                        93
-rw-r--r--  libavcodec/arm/mpegvideo_arm.c                  40
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te.c             100
-rw-r--r--  libavcodec/arm/mpegvideo_armv5te_s.S           117
-rw-r--r--  libavcodec/arm/mpegvideo_iwmmxt.c              119
-rw-r--r--  libavcodec/arm/simple_idct_arm.S               486
-rw-r--r--  libavcodec/arm/simple_idct_armv5te.S           703
-rw-r--r--  libavcodec/arm/simple_idct_armv6.S             433
-rw-r--r--  libavcodec/arm/simple_idct_neon.S              362
21 files changed, 7352 insertions, 0 deletions
diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
new file mode 100644
index 0000000000..e2595f4789
--- /dev/null
+++ b/libavcodec/arm/asm.S
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ .macro require8, val=1
+ .eabi_attribute 24, \val
+ .endm
+
+ .macro preserve8, val=1
+ .eabi_attribute 25, \val
+ .endm
+
+ .macro function name, export=0
+.if \export
+ .global \name
+.endif
+ .type \name, %function
+ .func \name
+\name:
+ .endm
diff --git a/libavcodec/arm/dsputil_arm.c b/libavcodec/arm/dsputil_arm.c
new file mode 100644
index 0000000000..eaa6b9eb8d
--- /dev/null
+++ b/libavcodec/arm/dsputil_arm.c
@@ -0,0 +1,217 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+#ifdef HAVE_IPP
+#include <ipp.h>
+#endif
+
+void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+
+void j_rev_dct_ARM(DCTELEM *data);
+void simple_idct_ARM(DCTELEM *data);
+
+void simple_idct_armv5te(DCTELEM *data);
+void simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+void simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_armv6(DCTELEM *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
+
+void ff_simple_idct_neon(DCTELEM *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+
+/* XXX: local hack */
+static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_prefetch_arm(void *mem, int stride, int h);
+
+CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
+CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
+CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
+CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
+
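[Editor's note] The CALL_2X_PIXELS invocations above build the 16-pixel-wide helpers out of the 8-pixel-wide ones. The macro itself comes from dsputil.h and is not shown in this diff; a minimal sketch of what it is assumed to generate (the wrapper name below is hypothetical, for illustration only):

    /* Assumed expansion of CALL_2X_PIXELS(wide, narrow, 8): run the
     * 8-pixel routine on the left half, then again 8 bytes further in
     * for the right half of the 16-pixel block. */
    static void put_pixels16_x2_sketch(uint8_t *block, const uint8_t *pixels,
                                       int line_size, int h)
    {
        put_pixels8_x2_arm(block,     pixels,     line_size, h);
        put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
    }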
+void ff_add_pixels_clamped_ARM(short *block, unsigned char *dest,
+ int line_size);
+
+/* XXX: these functions should be removed as soon as all IDCTs are
+   converted */
+static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ j_rev_dct_ARM (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ j_rev_dct_ARM (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ simple_idct_ARM (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ simple_idct_ARM (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+
+#ifdef HAVE_IPP
+static void simple_idct_ipp(DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s_C1I(block);
+}
+static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
+}
+
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
+
+static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ippiDCT8x8Inv_Video_16s_C1I(block);
+#ifdef HAVE_IWMMXT
+ add_pixels_clamped_iwmmxt(block, dest, line_size);
+#else
+ ff_add_pixels_clamped_ARM(block, dest, line_size);
+#endif
+}
+#endif
+
+int mm_support(void)
+{
+ return ENABLE_IWMMXT * FF_MM_IWMMXT;
+}
+
+void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
+{
+ int idct_algo= avctx->idct_algo;
+
+ ff_put_pixels_clamped = c->put_pixels_clamped;
+ ff_add_pixels_clamped = c->add_pixels_clamped;
+
+ if (avctx->lowres == 0) {
+ if(idct_algo == FF_IDCT_AUTO){
+#if defined(HAVE_IPP)
+ idct_algo = FF_IDCT_IPP;
+#elif defined(HAVE_NEON)
+ idct_algo = FF_IDCT_SIMPLENEON;
+#elif defined(HAVE_ARMV6)
+ idct_algo = FF_IDCT_SIMPLEARMV6;
+#elif defined(HAVE_ARMV5TE)
+ idct_algo = FF_IDCT_SIMPLEARMV5TE;
+#else
+ idct_algo = FF_IDCT_ARM;
+#endif
+ }
+
+ if(idct_algo==FF_IDCT_ARM){
+ c->idct_put= j_rev_dct_ARM_put;
+ c->idct_add= j_rev_dct_ARM_add;
+ c->idct = j_rev_dct_ARM;
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+ } else if (idct_algo==FF_IDCT_SIMPLEARM){
+ c->idct_put= simple_idct_ARM_put;
+ c->idct_add= simple_idct_ARM_add;
+ c->idct = simple_idct_ARM;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
+#ifdef HAVE_ARMV6
+ } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
+ c->idct_put= ff_simple_idct_put_armv6;
+ c->idct_add= ff_simple_idct_add_armv6;
+ c->idct = ff_simple_idct_armv6;
+ c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+#endif
+#ifdef HAVE_ARMV5TE
+ } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
+ c->idct_put= simple_idct_put_armv5te;
+ c->idct_add= simple_idct_add_armv5te;
+ c->idct = simple_idct_armv5te;
+ c->idct_permutation_type = FF_NO_IDCT_PERM;
+#endif
+#ifdef HAVE_IPP
+ } else if (idct_algo==FF_IDCT_IPP){
+ c->idct_put= simple_idct_ipp_put;
+ c->idct_add= simple_idct_ipp_add;
+ c->idct = simple_idct_ipp;
+ c->idct_permutation_type= FF_NO_IDCT_PERM;
+#endif
+#ifdef HAVE_NEON
+ } else if (idct_algo==FF_IDCT_SIMPLENEON){
+ c->idct_put= ff_simple_idct_put_neon;
+ c->idct_add= ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+#endif
+ }
+ }
+
+ c->put_pixels_tab[0][0] = put_pixels16_arm;
+ c->put_pixels_tab[0][1] = put_pixels16_x2_arm;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_arm;
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_arm;
+ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm;
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm;
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm;
+ c->put_pixels_tab[1][0] = put_pixels8_arm;
+ c->put_pixels_tab[1][1] = put_pixels8_x2_arm;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_arm;
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_arm;
+ c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
+
+#ifdef HAVE_ARMV5TE
+ c->prefetch = ff_prefetch_arm;
+#endif
+
+#ifdef HAVE_IWMMXT
+ dsputil_init_iwmmxt(c, avctx);
+#endif
+#ifdef HAVE_ARMVFP
+ ff_float_init_arm_vfp(c, avctx);
+#endif
+#ifdef HAVE_NEON
+ ff_dsputil_init_neon(c, avctx);
+#endif
+}
diff --git a/libavcodec/arm/dsputil_arm_s.S b/libavcodec/arm/dsputil_arm_s.S
new file mode 100644
index 0000000000..639b7b8af8
--- /dev/null
+++ b/libavcodec/arm/dsputil_arm_s.S
@@ -0,0 +1,799 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "asm.S"
+
+ preserve8
+
+#ifndef HAVE_PLD
+.macro pld reg
+.endm
+#endif
+
+#ifdef HAVE_ARMV5TE
+function ff_prefetch_arm, export=1
+ subs r2, r2, #1
+ pld [r0]
+ add r0, r0, r1
+ bne ff_prefetch_arm
+ bx lr
+ .endfunc
+#endif
+
+.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+ mov \Rd0, \Rn0, lsr #(\shift * 8)
+ mov \Rd1, \Rn1, lsr #(\shift * 8)
+ mov \Rd2, \Rn2, lsr #(\shift * 8)
+ mov \Rd3, \Rn3, lsr #(\shift * 8)
+ orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
+ orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+ orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+ orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
+.endm
+.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
+ mov \R0, \R0, lsr #(\shift * 8)
+ orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
+ mov \R1, \R1, lsr #(\shift * 8)
+ orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
+.endm
+.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+ mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
+ mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
+ orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
+ orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
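[Editor's note] The ADJ_ALIGN_* macros above emulate an unaligned load: the caller performs word-aligned loads and the macros shift/OR neighbouring words together by the misalignment. A C model of ADJ_ALIGN_DOUBLEWORD, assuming little-endian words and a byte shift of 1..3 (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Rebuild two 32-bit words that start `shift` bytes into r0, from three
     * aligned words r0, r1, r2 (little-endian).  shift must be 1..3, as in
     * the assembly above; shift == 0 uses a separate aligned code path. */
    static inline void adj_align_doubleword(unsigned shift, uint32_t *r0,
                                            uint32_t *r1, uint32_t r2)
    {
        *r0 = (*r0 >> (shift * 8)) | (*r1 << (32 - shift * 8));
        *r1 = (*r1 >> (shift * 8)) | ( r2  << (32 - shift * 8));
    }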
+.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroy
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ orr \Rn0, \Rn0, \Rm0
+ orr \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ sub \Rd0, \Rn0, \Rd0, lsr #1
+ sub \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroy
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ and \Rn0, \Rn0, \Rm0
+ and \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ add \Rd0, \Rn0, \Rd0, lsr #1
+ add \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
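[Editor's note] Both macros above are the classic SWAR packed-byte average: four bytes are averaged inside one 32-bit word without letting carries cross byte boundaries. A scalar C model of the two formulas (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Rounded average, (a + b + 1) >> 1 per byte.  Masking with 0xFEFEFEFE
     * before the shift keeps each byte's low bit from leaking into the
     * byte below. */
    static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEU) >> 1);
    }

    /* Truncating (no_rnd) average, (a + b) >> 1 per byte. */
    static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a & b) + (((a ^ b) & 0xFEFEFEFEU) >> 1);
    }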
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels16_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11, lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ bic r1, r1, #3
+ add r5, r5, r4, lsl #2
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r7}
+ add r1, r1, r2
+ stmia r0, {r4-r7}
+ pld [r1]
+ subs r3, r3, #1
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+2:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+3:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r11, pc}
+ .align 8
+4:
+ ldmia r1, {r4-r8}
+ add r1, r1, r2
+ ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r9-r12}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+5:
+ .word 1b
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r5,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ bic r1, r1, #3
+ add r5, r5, r4, lsl #2
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ subs r3, r3, #1
+ pld [r1]
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r12}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r5,pc}
+ .align 8
+5:
+ .word 1b
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r10,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ ldmfd sp!, {r4-r10,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r5, r10}
+ add r1, r1, r2
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+
+@ ----------------------------------------------------------------
+ .align 8
+function put_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ mov r3, r3, lsr #1
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+6: ldmia r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adr r5, 5f
+ ands r4, r1, #3
+ mov r3, r3, lsr #1
+ ldr r12, [r5]
+ add r5, r5, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+6: ldmia r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldmia r1, {r4-r5}
+ add r1, r1, r2
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stmia r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+2:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+3:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+4:
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+6: ldmia r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ ldmia r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stmia r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ ldmfd sp!, {r4-r11,pc}
+ .align 8
+5:
+ .word 0xFEFEFEFE
+ .word 2b
+ .word 3b
+ .word 4b
+ .endfunc
+
+@ ----------------------------------------------------------------
+.macro RND_XY2_IT align
+ @ l1 = (a & 0x03030303) + (b & 0x03030303) (+ 0x02020202 for the rounding version)
+ @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
+.if \align == 0
+ ldmia r1, {r6-r8}
+.elseif \align == 3
+ ldmia r1, {r5-r7}
+.else
+ ldmia r1, {r8-r10}
+.endif
+ add r1, r1, r2
+ pld [r1]
+.if \align == 0
+ ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
+.elseif \align == 1
+ ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
+ ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
+.elseif \align == 2
+ ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
+ ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
+.elseif \align == 3
+ ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
+.endif
+ ldr r14, [r12, #0] @ 0x03030303
+ tst r3, #1
+ and r8, r4, r14
+ and r9, r5, r14
+ and r10, r6, r14
+ and r11, r7, r14
+ ldreq r14, [r12, #16] @ 0x02020202/0x01010101
+ add r8, r8, r10
+ add r9, r9, r11
+ addeq r8, r8, r14
+ addeq r9, r9, r14
+ ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
+ and r4, r14, r4, lsr #2
+ and r5, r14, r5, lsr #2
+ and r6, r14, r6, lsr #2
+ and r7, r14, r7, lsr #2
+ add r10, r4, r6
+ add r11, r5, r7
+ subs r3, r3, #1
+.endm
+
+.macro RND_XY2_EXPAND align
+ RND_XY2_IT \align
+6: stmfd sp!, {r8-r11}
+ RND_XY2_IT \align
+ ldmfd sp!, {r4-r7}
+ add r4, r4, r8
+ add r5, r5, r9
+ add r6, r6, r10
+ add r7, r7, r11
+ ldr r14, [r12, #24] @ 0x0F0F0F0F
+ and r4, r14, r4, lsr #2
+ and r5, r14, r5, lsr #2
+ add r4, r4, r6
+ add r5, r5, r7
+ stmia r0, {r4-r5}
+ add r0, r0, r2
+ bge 6b
+ ldmfd sp!, {r4-r11,pc}
+.endm
+
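[Editor's note] RND_XY2_IT/RND_XY2_EXPAND above implement the two-dimensional half-pel case: each output byte is the average of a 2x2 source neighbourhood, (A + B + C + D + 2) >> 2 for the rounding variant and (A + B + C + D + 1) >> 2 for no_rnd. The packed trick splits every byte into its low two bits (mask 0x03030303) and its high six bits (mask 0xFCFCFCFC, pre-shifted) so the four-way sum never overflows a byte lane. A plain scalar reference under that reading; the _ref name is hypothetical:

    #include <stdint.h>

    /* Every destination byte is the rounded mean of the 2x2 source square
     * whose top-left corner sits at the same position.  rnd is 2 for the
     * rounding version, 1 for the no_rnd version. */
    static void put_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                    int line_size, int h, int rnd)
    {
        for (; h > 0; h--) {
            for (int i = 0; i < 8; i++)
                block[i] = (pixels[i] + pixels[i + 1] +
                            pixels[i + line_size] + pixels[i + line_size + 1] +
                            rnd) >> 2;
            block  += line_size;
            pixels += line_size;
        }
    }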
+ .align 8
+function put_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adrl r12, 5f
+ ands r4, r1, #3
+ add r5, r12, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ RND_XY2_EXPAND 0
+
+ .align 8
+2:
+ RND_XY2_EXPAND 1
+
+ .align 8
+3:
+ RND_XY2_EXPAND 2
+
+ .align 8
+4:
+ RND_XY2_EXPAND 3
+
+5:
+ .word 0x03030303
+ .word 2b
+ .word 3b
+ .word 4b
+ .word 0x02020202
+ .word 0xFCFCFCFC >> 2
+ .word 0x0F0F0F0F
+ .endfunc
+
+ .align 8
+function put_no_rnd_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ stmfd sp!, {r4-r11,lr} @ R14 is also called LR
+ adrl r12, 5f
+ ands r4, r1, #3
+ add r5, r12, r4, lsl #2
+ bic r1, r1, #3
+ ldrne pc, [r5]
+1:
+ RND_XY2_EXPAND 0
+
+ .align 8
+2:
+ RND_XY2_EXPAND 1
+
+ .align 8
+3:
+ RND_XY2_EXPAND 2
+
+ .align 8
+4:
+ RND_XY2_EXPAND 3
+
+5:
+ .word 0x03030303
+ .word 2b
+ .word 3b
+ .word 4b
+ .word 0x01010101
+ .word 0xFCFCFCFC >> 2
+ .word 0x0F0F0F0F
+ .endfunc
+
+@ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
+function ff_add_pixels_clamped_ARM, export=1
+ push {r4-r10}
+ mov r10, #8
+1:
+ ldr r4, [r1] /* load dest */
+ /* block[0] and block[1]*/
+ ldrsh r5, [r0]
+ ldrsh r7, [r0, #2]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r5, r6
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #4] /* moved from [A] */
+ orr r9, r9, r8, lsl #8
+ /* block[2] and block[3] */
+ /* [A] */
+ ldrsh r7, [r0, #6]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ ldr r4, [r1, #4] /* moved from [B] */
+ orr r9, r9, r8, lsl #24
+ /* store dest */
+ ldrsh r5, [r0, #8] /* moved from [C] */
+ str r9, [r1]
+
+ /* load dest */
+ /* [B] */
+ /* block[4] and block[5] */
+ /* [C] */
+ ldrsh r7, [r0, #10]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r5, r6
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #12] /* moved from [D] */
+ orr r9, r9, r8, lsl #8
+ /* block[6] and block[7] */
+ /* [D] */
+ ldrsh r7, [r0, #14]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ add r0, r0, #16 /* moved from [E] */
+ orr r9, r9, r8, lsl #24
+ subs r10, r10, #1 /* moved from [F] */
+ /* store dest */
+ str r9, [r1, #4]
+
+ /* [E] */
+ /* [F] */
+ add r1, r1, r2
+ bne 1b
+
+ pop {r4-r10}
+ bx lr
+ .endfunc
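[Editor's note] ff_add_pixels_clamped_ARM above adds an 8x8 block of 16-bit IDCT coefficients onto the destination pixels with unsigned saturation, packing four results back into one 32-bit store. A plain C model of the same operation; the helper names are hypothetical, for illustration only:

    #include <stdint.h>

    static inline uint8_t clip_uint8_ref(int v)   /* hypothetical helper */
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* dest += block, clamped to 0..255, over an 8x8 block. */
    static void add_pixels_clamped_ref(const int16_t *block, uint8_t *dest,
                                       int line_size)
    {
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                dest[x] = clip_uint8_ref(dest[x] + block[x]);
            block += 8;
            dest  += line_size;
        }
    }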
diff --git a/libavcodec/arm/dsputil_iwmmxt.c b/libavcodec/arm/dsputil_iwmmxt.c
new file mode 100644
index 0000000000..3d6260b056
--- /dev/null
+++ b/libavcodec/arm/dsputil_iwmmxt.c
@@ -0,0 +1,205 @@
+/*
+ * iWMMXt optimized DSP utils
+ * Copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2b"
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+
+#define DEF(x, y) x ## _ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2br"
+#include "dsputil_iwmmxt_rnd_template.c"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+
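[Editor's note] dsputil_iwmmxt_rnd_template.c is included twice above: once with DEF()/SET_RND()/WAVG2B configured for the no_rnd variants and once for the rounding variants, so each template function is emitted under two names. A minimal, self-contained sketch of the token-pasting pattern (the STR/XSTR helpers are not part of the patch):

    #include <stdio.h>

    /* Same pasting as the second instantiation above. */
    #define DEF(x, y) x ## _ ## y ## _iwmmxt
    #define STR(s)  #s
    #define XSTR(s) STR(s)

    int main(void)
    {
        /* DEF(put, pixels8) becomes the identifier put_pixels8_iwmmxt;
         * with the first set of macros it would become
         * put_no_rnd_pixels8_iwmmxt. */
        puts(XSTR(DEF(put, pixels8)));
        return 0;
    }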
+// need scheduling
+#define OP(AVG) \
+ __asm__ volatile ( \
+ /* alignment */ \
+ "and r12, %[pixels], #7 \n\t" \
+ "bic %[pixels], %[pixels], #7 \n\t" \
+ "tmcr wcgr1, r12 \n\t" \
+ \
+ "wldrd wr0, [%[pixels]] \n\t" \
+ "wldrd wr1, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "walignr1 wr4, wr0, wr1 \n\t" \
+ \
+ "1: \n\t" \
+ \
+ "wldrd wr2, [%[pixels]] \n\t" \
+ "wldrd wr3, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "pld [%[pixels]] \n\t" \
+ "walignr1 wr5, wr2, wr3 \n\t" \
+ AVG " wr6, wr4, wr5 \n\t" \
+ "wstrd wr6, [%[block]] \n\t" \
+ "add %[block], %[block], %[line_size] \n\t" \
+ \
+ "wldrd wr0, [%[pixels]] \n\t" \
+ "wldrd wr1, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "walignr1 wr4, wr0, wr1 \n\t" \
+ "pld [%[pixels]] \n\t" \
+ AVG " wr6, wr4, wr5 \n\t" \
+ "wstrd wr6, [%[block]] \n\t" \
+ "add %[block], %[block], %[line_size] \n\t" \
+ \
+ "subs %[h], %[h], #2 \n\t" \
+ "bne 1b \n\t" \
+ : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
+ : [line_size]"r"(line_size) \
+ : "memory", "r12");
+void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ OP("wavg2br");
+}
+void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ OP("wavg2b");
+}
+#undef OP
+
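[Editor's note] The OP macro above generates the vertical half-pel (y2) kernels: each output row is the byte-wise average of two consecutive source rows, where wavg2br is taken to be the rounding average and wavg2b the truncating one, as the rnd/no_rnd pairing suggests. A scalar model under that assumption; the _ref name is hypothetical:

    #include <stdint.h>

    /* rnd = 1 models wavg2br (rounded), rnd = 0 models wavg2b (truncating). */
    static void put_pixels8_y2_ref(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h, int rnd)
    {
        for (; h > 0; h--) {
            for (int i = 0; i < 8; i++)
                block[i] = (pixels[i] + pixels[i + line_size] + rnd) >> 1;
            block  += line_size;
            pixels += line_size;
        }
    }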
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ uint8_t *pixels2 = pixels + line_size;
+
+ __asm__ volatile (
+ "mov r12, #4 \n\t"
+ "1: \n\t"
+ "pld [%[pixels], %[line_size2]] \n\t"
+ "pld [%[pixels2], %[line_size2]] \n\t"
+ "wldrd wr4, [%[pixels]] \n\t"
+ "wldrd wr5, [%[pixels2]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wunpckelub wr6, wr4 \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "wunpckehub wr7, wr4 \n\t"
+ "wldrd wr1, [%[block], #8] \n\t"
+ "wunpckelub wr8, wr5 \n\t"
+ "wldrd wr2, [%[block], #16] \n\t"
+ "wunpckehub wr9, wr5 \n\t"
+ "wldrd wr3, [%[block], #24] \n\t"
+ "add %[block], %[block], #32 \n\t"
+ "waddhss wr10, wr0, wr6 \n\t"
+ "waddhss wr11, wr1, wr7 \n\t"
+ "waddhss wr12, wr2, wr8 \n\t"
+ "waddhss wr13, wr3, wr9 \n\t"
+ "wpackhus wr14, wr10, wr11 \n\t"
+ "wpackhus wr15, wr12, wr13 \n\t"
+ "wstrd wr14, [%[pixels]] \n\t"
+ "add %[pixels], %[pixels], %[line_size2] \n\t"
+ "subs r12, r12, #1 \n\t"
+ "wstrd wr15, [%[pixels2]] \n\t"
+ "add %[pixels2], %[pixels2], %[line_size2] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
+ : [line_size2]"r"(line_size << 1)
+ : "cc", "memory", "r12");
+}
+
+static void clear_blocks_iwmmxt(DCTELEM *blocks)
+{
+ __asm__ volatile(
+ "wzero wr0 \n\t"
+ "mov r1, #(128 * 6 / 32) \n\t"
+ "1: \n\t"
+ "wstrd wr0, [%0] \n\t"
+ "wstrd wr0, [%0, #8] \n\t"
+ "wstrd wr0, [%0, #16] \n\t"
+ "wstrd wr0, [%0, #24] \n\t"
+ "subs r1, r1, #1 \n\t"
+ "add %0, %0, #32 \n\t"
+ "bne 1b \n\t"
+ : "+r"(blocks)
+ :
+ : "r1"
+ );
+}
+
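[Editor's note] clear_blocks_iwmmxt above zeroes the six 64-coefficient blocks (6 * 64 * sizeof(DCTELEM) = 768 bytes) 32 bytes per loop iteration. The scalar equivalent, assuming DCTELEM is a 16-bit type as in FFmpeg of this era:

    #include <stdint.h>
    #include <string.h>

    /* Wipe six 8x8 blocks of 16-bit coefficients. */
    static void clear_blocks_ref(int16_t *blocks)
    {
        memset(blocks, 0, 6 * 64 * sizeof(*blocks));
    }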
+static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ return;
+}
+
+/* A run time test is not simple. If this file is compiled in
+ * then we should install the functions
+ */
+int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
+
+void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
+{
+ if (avctx->dsp_mask) {
+ if (avctx->dsp_mask & FF_MM_FORCE)
+ mm_flags |= (avctx->dsp_mask & 0xffff);
+ else
+ mm_flags &= ~(avctx->dsp_mask & 0xffff);
+ }
+
+ if (!(mm_flags & FF_MM_IWMMXT)) return;
+
+ c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
+
+ c->clear_blocks = clear_blocks_iwmmxt;
+
+ c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
+ c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
+
+ c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
+ c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
+}
diff --git a/libavcodec/arm/dsputil_iwmmxt_rnd_template.c b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c
new file mode 100644
index 0000000000..35a5a9b8b4
--- /dev/null
+++ b/libavcodec/arm/dsputil_iwmmxt_rnd_template.c
@@ -0,0 +1,1114 @@
+/*
+ * iWMMXt optimized DSP utils
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "wldrd wr2, [r5] \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr0 \n\t"
+ WAVG2B" wr10, wr10, wr2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size] \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr2, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "walignr1 wr9, wr1, wr2 \n\t"
+ "wldrd wr5, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "walignr1 wr11, wr4, wr5 \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "wstrd wr11, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ __asm__ volatile (
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "1: \n\t"
+ "wldrd wr0, [%[pixels]] \n\t"
+ "wldrd wr1, [%[pixels], #8] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wldrd wr2, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr3, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr8, wr0, wr1 \n\t"
+ "wldrd wr4, [r4, #8] \n\t"
+ "walignr1 wr9, wr1, wr2 \n\t"
+ "wldrd wr5, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "pld [r4] \n\t"
+ "wldrd wr1, [%[block], #8] \n\t"
+ "pld [r4, #32] \n\t"
+ "wldrd wr2, [r5] \n\t"
+ "walignr1 wr10, wr3, wr4 \n\t"
+ "wldrd wr3, [r5, #8] \n\t"
+ WAVG2B" wr8, wr8, wr0 \n\t"
+ WAVG2B" wr9, wr9, wr1 \n\t"
+ WAVG2B" wr10, wr10, wr2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "walignr1 wr11, wr4, wr5 \n\t"
+ WAVG2B" wr11, wr11, wr3 \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr10, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wstrd wr11, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
+ :
+ : "memory", "r4", "r5", "r12");
+}
+
+void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr15, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "walignr1 wr3, wr14, wr15 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr5, wr12 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "wmoveq wr7, wr15 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr5, wr11, wr12 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "walignr2ne wr7, wr14, wr15 \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr1, wr1, wr5 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wstrd wr1, [%[block], #8] \n\t"
+ WAVG2B" wr3, wr3, wr7 \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr3, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "wldrd wr12, [r5] \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ WAVG2B" wr0, wr0, wr10 \n\t"
+ WAVG2B" wr2, wr2, wr12 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "add r4, %[pixels], %[line_size]\n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add r5, %[block], %[line_size] \n\t"
+ "mov %[line_size], %[line_size], lsl #1 \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "wldrd wr13, [r4] \n\t"
+ "pld [%[pixels]] \n\t"
+ "wldrd wr14, [r4, #8] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wldrd wr15, [r4, #16] \n\t"
+ "add r4, r4, %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [r4] \n\t"
+ "pld [r4, #32] \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "walignr1 wr2, wr13, wr14 \n\t"
+ "walignr1 wr3, wr14, wr15 \n\t"
+ "wmoveq wr4, wr11 \n\t"
+ "wmoveq wr5, wr12 \n\t"
+ "wmoveq wr6, wr14 \n\t"
+ "wmoveq wr7, wr15 \n\t"
+ "walignr2ne wr4, wr10, wr11 \n\t"
+ "walignr2ne wr5, wr11, wr12 \n\t"
+ "walignr2ne wr6, wr13, wr14 \n\t"
+ "walignr2ne wr7, wr14, wr15 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr0, wr0, wr4 \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr1, wr1, wr5 \n\t"
+ "wldrd wr12, [r5] \n\t"
+ WAVG2B" wr2, wr2, wr6 \n\t"
+ "wldrd wr13, [r5, #8] \n\t"
+ WAVG2B" wr3, wr3, wr7 \n\t"
+ WAVG2B" wr0, wr0, wr10 \n\t"
+ WAVG2B" wr1, wr1, wr11 \n\t"
+ WAVG2B" wr2, wr2, wr12 \n\t"
+ WAVG2B" wr3, wr3, wr13 \n\t"
+ "wstrd wr0, [%[block]] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr1, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wstrd wr2, [r5] \n\t"
+ "pld [%[block]] \n\t"
+ "wstrd wr3, [r5, #8] \n\t"
+ "add r5, r5, %[line_size] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [r5] \n\t"
+ "pld [r5, #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ :"r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "cc", "memory", "r12");
+}
+
+void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "walignr1 wr5, wr11, wr12 \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ int stride = line_size;
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "and r12, %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+
+ "1: \n\t"
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr4, wr10, wr11 \n\t"
+ "walignr1 wr5, wr11, wr12 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ WAVG2B" wr9, wr9, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "wldrd wr10, [%[pixels]] \n\t"
+ "wldrd wr11, [%[pixels], #8] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr12, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr0, wr10, wr11 \n\t"
+ "walignr1 wr1, wr11, wr12 \n\t"
+ "wldrd wr10, [%[block]] \n\t"
+ "wldrd wr11, [%[block], #8] \n\t"
+ WAVG2B" wr8, wr0, wr4 \n\t"
+ WAVG2B" wr9, wr1, wr5 \n\t"
+ WAVG2B" wr8, wr8, wr10 \n\t"
+ WAVG2B" wr9, wr9, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
+ :
+ : "r4", "r5", "r12", "memory");
+}
+
+void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "add r12, r12, #1 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "cmp r12, #8 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ /* alignment */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "tmcr wcgr2, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr7, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr6, wr7 \n\t"
+ "wunpckehub wr7, wr7 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr6, wr6, wr10 \n\t"
+ "waddhus wr7, wr7, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ "subs %[h], %[h], #2 \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "add r12, r12, #1 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "tmcr wcgr2, r12 \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "cmp r12, #8 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "wldrd wr12, [%[pixels]] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
+
+void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ // [wr0 wr1 wr2 wr3] for previous line
+ // [wr4 wr5 wr6 wr7] for current line
+ SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
+ __asm__ volatile(
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "pld [%[pixels]] \n\t"
+ "mov r12, #2 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "tmcr wcgr0, r12 \n\t" /* for shift value */
+ /* alignment */
+ "and r12, %[pixels], #7 \n\t"
+ "bic %[pixels], %[pixels], #7 \n\t"
+ "tmcr wcgr1, r12 \n\t"
+ "add r12, r12, #1 \n\t"
+ "tmcr wcgr2, r12 \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "pld [%[pixels]] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+
+ "1: \n\t"
+ // [wr0 wr1 wr2 wr3]
+ // [wr4 wr5 wr6 wr7] <= *
+ "wldrd wr12, [%[pixels]] \n\t"
+ "cmp r12, #8 \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr6, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr7, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr4, wr6 \n\t"
+ "wunpckehub wr5, wr6 \n\t"
+ "wunpckelub wr6, wr7 \n\t"
+ "wunpckehub wr7, wr7 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr4, wr4, wr8 \n\t"
+ "waddhus wr5, wr5, wr9 \n\t"
+ "waddhus wr6, wr6, wr10 \n\t"
+ "waddhus wr7, wr7, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wldrd wr13, [%[block], #8] \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ WAVG2B" wr9, wr9, wr13 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+
+ // [wr0 wr1 wr2 wr3] <= *
+ // [wr4 wr5 wr6 wr7]
+ "wldrd wr12, [%[pixels]] \n\t"
+ "pld [%[block]] \n\t"
+ "wldrd wr13, [%[pixels], #8] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wldrd wr14, [%[pixels], #16] \n\t"
+ "add %[pixels], %[pixels], %[line_size] \n\t"
+ "walignr1 wr2, wr12, wr13 \n\t"
+ "pld [%[pixels]] \n\t"
+ "pld [%[pixels], #32] \n\t"
+ "walignr1 wr3, wr13, wr14 \n\t"
+ "wmoveq wr10, wr13 \n\t"
+ "wmoveq wr11, wr14 \n\t"
+ "walignr2ne wr10, wr12, wr13 \n\t"
+ "walignr2ne wr11, wr13, wr14 \n\t"
+ "wunpckelub wr0, wr2 \n\t"
+ "wunpckehub wr1, wr2 \n\t"
+ "wunpckelub wr2, wr3 \n\t"
+ "wunpckehub wr3, wr3 \n\t"
+ "wunpckelub wr8, wr10 \n\t"
+ "wunpckehub wr9, wr10 \n\t"
+ "wunpckelub wr10, wr11 \n\t"
+ "wunpckehub wr11, wr11 \n\t"
+ "waddhus wr0, wr0, wr8 \n\t"
+ "waddhus wr1, wr1, wr9 \n\t"
+ "waddhus wr2, wr2, wr10 \n\t"
+ "waddhus wr3, wr3, wr11 \n\t"
+ "waddhus wr8, wr0, wr4 \n\t"
+ "waddhus wr9, wr1, wr5 \n\t"
+ "waddhus wr10, wr2, wr6 \n\t"
+ "waddhus wr11, wr3, wr7 \n\t"
+ "waddhus wr8, wr8, wr15 \n\t"
+ "waddhus wr9, wr9, wr15 \n\t"
+ "waddhus wr10, wr10, wr15 \n\t"
+ "waddhus wr11, wr11, wr15 \n\t"
+ "wsrlhg wr8, wr8, wcgr0 \n\t"
+ "wsrlhg wr9, wr9, wcgr0 \n\t"
+ "wldrd wr12, [%[block]] \n\t"
+ "wldrd wr13, [%[block], #8] \n\t"
+ "wsrlhg wr10, wr10, wcgr0 \n\t"
+ "wsrlhg wr11, wr11, wcgr0 \n\t"
+ "wpackhus wr8, wr8, wr9 \n\t"
+ "wpackhus wr9, wr10, wr11 \n\t"
+ WAVG2B" wr8, wr8, wr12 \n\t"
+ WAVG2B" wr9, wr9, wr13 \n\t"
+ "wstrd wr8, [%[block]] \n\t"
+ "wstrd wr9, [%[block], #8] \n\t"
+ "add %[block], %[block], %[line_size] \n\t"
+ "subs %[h], %[h], #2 \n\t"
+ "pld [%[block]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "bne 1b \n\t"
+ : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
+ : [line_size]"r"(line_size)
+ : "r12", "memory");
+}
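For reference, the xy2 routines above implement the usual half-pel interpolation in both directions; the SET_RND(wr15) constant mentioned in their comments is the rounding bias (2 for the rnd variants, 1 for no_rnd). A minimal scalar sketch, with a hypothetical helper name used only for illustration:

    #include <stdint.h>

    /* Scalar sketch of the xy2 half-pel copy: average of four neighbouring
     * pixels with a rounding bias of 2 (rnd) or 1 (no_rnd). */
    static void put_pixels8_xy2_ref(uint8_t *block, const uint8_t *pixels,
                                    int line_size, int h, int bias)
    {
        int x, y;
        for (y = 0; y < h; y++) {
            for (x = 0; x < 8; x++)
                block[x] = (pixels[x] + pixels[x + 1] +
                            pixels[x + line_size] + pixels[x + line_size + 1] +
                            bias) >> 2;
            block  += line_size;
            pixels += line_size;
        }
    }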
diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
new file mode 100644
index 0000000000..5204c50e37
--- /dev/null
+++ b/libavcodec/arm/dsputil_neon.c
@@ -0,0 +1,169 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+
+void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+
+ c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+ c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+ c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+ c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+ c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+ c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+ c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
+
+ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
+ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
+ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+ c->h264_idct_add = ff_h264_idct_add_neon;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+}
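The tables filled in here are indexed by block size and sub-pel position. As a rough usage sketch (the caller, variable names and offsets x, y, dst, src, stride below are assumptions for illustration, following the usual FFmpeg convention of dxy = x + 2*y for the pixels tables and x + 4*y for the H.264 quarter-pel tables):

    #include "libavcodec/dsputil.h"

    /* Hypothetical caller: dispatch a 16x16 half-pel copy through the
     * function pointers installed by ff_dsputil_init_neon(). Outer index
     * 0 selects the 16x16 variants, 1 the 8x8 variants. */
    static void copy_block16_sketch(DSPContext *c, uint8_t *dst,
                                    const uint8_t *src, int stride,
                                    int x, int y)
    {
        c->put_pixels_tab[0][x + 2 * y](dst, src, stride, 16);
    }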
diff --git a/libavcodec/arm/dsputil_neon_s.S b/libavcodec/arm/dsputil_neon_s.S
new file mode 100644
index 0000000000..1241cdc9ef
--- /dev/null
+++ b/libavcodec/arm/dsputil_neon_s.S
@@ -0,0 +1,274 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ preserve8
+ .fpu neon
+ .text
+
+ .macro pixels16 avg=0
+.if \avg
+ mov ip, r0
+.endif
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d2, d3}, [r1], r2
+ vld1.64 {d4, d5}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.64 {d6, d7}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+.if \avg
+ vld1.64 {d16,d17}, [ip], r2
+ vrhadd.u8 q0, q0, q8
+ vld1.64 {d18,d19}, [ip], r2
+ vrhadd.u8 q1, q1, q9
+ vld1.64 {d20,d21}, [ip], r2
+ vrhadd.u8 q2, q2, q10
+ vld1.64 {d22,d23}, [ip], r2
+ vrhadd.u8 q3, q3, q11
+.endif
+ subs r3, r3, #4
+ vst1.64 {d0, d1}, [r0,:128], r2
+ vst1.64 {d2, d3}, [r0,:128], r2
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels16_x2 vhadd=vrhadd.u8
+1: vld1.64 {d0-d2}, [r1], r2
+ vld1.64 {d4-d6}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vext.8 q1, q0, q1, #1
+ \vhadd q0, q0, q1
+ vext.8 q3, q2, q3, #1
+ \vhadd q2, q2, q3
+ vst1.64 {d0, d1}, [r0,:128], r2
+ vst1.64 {d4, d5}, [r0,:128], r2
+ bne 1b
+ bx lr
+ .endm
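The \vhadd parameter of these macros selects the rounding behaviour: vrhadd.u8 computes (a + b + 1) >> 1 while vhadd.u8 computes (a + b) >> 1, which is how the rnd and no_rnd variants are generated from one body. A scalar sketch of the x2 case, with a hypothetical helper name:

    #include <stdint.h>

    /* rnd is 1 for the rounding variant, 0 for no_rnd. */
    static void put_pixels16_x2_ref(uint8_t *dst, const uint8_t *src,
                                    int stride, int h, int rnd)
    {
        int i, j;
        for (j = 0; j < h; j++) {
            for (i = 0; i < 16; i++)
                dst[i] = (src[i] + src[i + 1] + rnd) >> 1;
            dst += stride;
            src += stride;
        }
    }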
+
+ .macro pixels16_y2 vhadd=vrhadd.u8
+ push {lr}
+ add ip, r1, r2
+ lsl lr, r2, #1
+ vld1.64 {d0, d1}, [r1], lr
+ vld1.64 {d2, d3}, [ip], lr
+1: subs r3, r3, #2
+ \vhadd q2, q0, q1
+ vld1.64 {d0, d1}, [r1], lr
+ \vhadd q3, q0, q1
+ vld1.64 {d2, d3}, [ip], lr
+ pld [r1]
+ pld [ip]
+ vst1.64 {d4, d5}, [r0,:128], r2
+ vst1.64 {d6, d7}, [r0,:128], r2
+ bne 1b
+ pop {pc}
+ .endm
+
+ .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+ push {lr}
+ lsl lr, r2, #1
+ add ip, r1, r2
+ vld1.64 {d0-d2}, [r1], lr
+ vld1.64 {d4-d6}, [ip], lr
+.if \no_rnd
+ vmov.i16 q13, #1
+.endif
+ pld [r1]
+ pld [ip]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2
+ vaddl.u8 q10, d1, d3
+ vaddl.u8 q9, d4, d6
+ vaddl.u8 q11, d5, d7
+1: subs r3, r3, #2
+ vld1.64 {d0-d2}, [r1], lr
+ vadd.u16 q12, q8, q9
+ pld [r1]
+.if \no_rnd
+ vadd.u16 q12, q12, q13
+.endif
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ \vshrn d28, q12, #2
+.if \no_rnd
+ vadd.u16 q1, q1, q13
+.endif
+ \vshrn d29, q1, #2
+ vaddl.u8 q8, d0, d30
+ vld1.64 {d2-d4}, [ip], lr
+ vaddl.u8 q10, d1, d31
+ vst1.64 {d28,d29}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+ pld [ip]
+.if \no_rnd
+ vadd.u16 q12, q12, q13
+.endif
+ vext.8 q2, q1, q2, #1
+ vadd.u16 q0, q10, q11
+ \vshrn d30, q12, #2
+.if \no_rnd
+ vadd.u16 q0, q0, q13
+.endif
+ \vshrn d31, q0, #2
+ vaddl.u8 q9, d2, d4
+ vaddl.u8 q11, d3, d5
+ vst1.64 {d30,d31}, [r0,:128], r2
+ bgt 1b
+ pop {pc}
+ .endm
+
+ .macro pixels8
+1: vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.64 {d3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ subs r3, r3, #4
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ vst1.64 {d2}, [r0,:64], r2
+ vst1.64 {d3}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels8_x2 vhadd=vrhadd.u8
+1: vld1.64 {d0, d1}, [r1], r2
+ vext.8 d1, d0, d1, #1
+ vld1.64 {d2, d3}, [r1], r2
+ vext.8 d3, d2, d3, #1
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vswp d1, d2
+ \vhadd q0, q0, q1
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endm
+
+ .macro pixels8_y2 vhadd=vrhadd.u8
+ push {lr}
+ add ip, r1, r2
+ lsl lr, r2, #1
+ vld1.64 {d0}, [r1], lr
+ vld1.64 {d1}, [ip], lr
+1: subs r3, r3, #2
+ \vhadd d4, d0, d1
+ vld1.64 {d0}, [r1], lr
+ \vhadd d5, d0, d1
+ vld1.64 {d1}, [ip], lr
+ pld [r1]
+ pld [ip]
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+ bne 1b
+ pop {pc}
+ .endm
+
+ .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+ push {lr}
+ lsl lr, r2, #1
+ add ip, r1, r2
+ vld1.64 {d0, d1}, [r1], lr
+ vld1.64 {d2, d3}, [ip], lr
+.if \no_rnd
+ vmov.i16 q11, #1
+.endif
+ pld [r1]
+ pld [ip]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4
+ vaddl.u8 q9, d2, d6
+1: subs r3, r3, #2
+ vld1.64 {d0, d1}, [r1], lr
+ pld [r1]
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+.if \no_rnd
+ vadd.u16 q10, q10, q11
+.endif
+ vaddl.u8 q8, d0, d4
+ \vshrn d5, q10, #2
+ vld1.64 {d2, d3}, [ip], lr
+ vadd.u16 q10, q8, q9
+ pld [ip]
+.if \no_rnd
+ vadd.u16 q10, q10, q11
+.endif
+ vst1.64 {d5}, [r0,:64], r2
+ \vshrn d7, q10, #2
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q9, d2, d6
+ vst1.64 {d7}, [r0,:64], r2
+ bgt 1b
+ pop {pc}
+ .endm
+
+ .macro pixfunc pfx name suf rnd_op args:vararg
+function ff_\pfx\name\suf\()_neon, export=1
+ \name \rnd_op \args
+ .endfunc
+ .endm
+
+ .macro pixfunc2 pfx name args:vararg
+ pixfunc \pfx \name
+ pixfunc \pfx \name \args
+ .endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+ .endfunc
+
+ pixfunc put_ pixels16
+ pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+ .endfunc
+
+ pixfunc avg_ pixels16,, 1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+ mov r3, #8
+ .endfunc
+
+ pixfunc put_ pixels8
+ pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
+ pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
new file mode 100644
index 0000000000..04c8014e1e
--- /dev/null
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ .fpu neon @ required for gas to accept UAL syntax
+/*
+ * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
+ * throughput for almost all instructions (except double precision arithmetic),
+ * but rather high latency: 4 cycles for loads and 8 cycles for arithmetic
+ * operations, so scheduling code to avoid pipeline stalls is very important
+ * for performance. Another interesting feature is that VFP has independent
+ * load/store and arithmetic pipelines, so they can work simultaneously and
+ * yield more than 1 operation per cycle. The load/store pipeline can process
+ * 2 single precision floating point values per cycle and supports bulk loads
+ * and stores of large register sets. Arithmetic operations can be done on
+ * vectors, which keeps the arithmetic pipeline busy while the processor
+ * issues and executes other instructions. Detailed optimization manuals can
+ * be found at http://www.arm.com
+ */
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_c' function.
+ * Assume that len is a positive number and a multiple of 8
+ */
+@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
+function ff_vector_fmul_vfp, export=1
+ vpush {d8-d15}
+ mov r3, r0
+ fmrx r12, fpscr
+ orr r12, r12, #(3 << 16) /* set vector size to 4 */
+ fmxr fpscr, r12
+
+ vldmia r3!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmia r3!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s0, s8
+1:
+ subs r2, r2, #16
+ vmul.f32 s12, s4, s12
+ vldmiage r3!, {s16-s19}
+ vldmiage r1!, {s24-s27}
+ vldmiage r3!, {s20-s23}
+ vldmiage r1!, {s28-s31}
+ vmulge.f32 s24, s16, s24
+ vstmia r0!, {s8-s11}
+ vstmia r0!, {s12-s15}
+ vmulge.f32 s28, s20, s28
+ vldmiagt r3!, {s0-s3}
+ vldmiagt r1!, {s8-s11}
+ vldmiagt r3!, {s4-s7}
+ vldmiagt r1!, {s12-s15}
+ vmulge.f32 s8, s0, s8
+ vstmiage r0!, {s24-s27}
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ bic r12, r12, #(7 << 16) /* set vector size back to 1 */
+ fmxr fpscr, r12
+ vpop {d8-d15}
+ bx lr
+ .endfunc
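As documented above, this routine mirrors the generic vector_fmul_c behaviour: dst is multiplied element-wise by src. A minimal scalar sketch (helper name chosen for illustration, len assumed to be a positive multiple of 8 as the comment states):

    static void vector_fmul_ref(float *dst, const float *src, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] *= src[i];
    }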
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assume that len is a positive number and a multiple of 8
+ */
+@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+@ const float *src1, int len)
+function ff_vector_fmul_reverse_vfp, export=1
+ vpush {d8-d15}
+ add r2, r2, r3, lsl #2
+ vldmdb r2!, {s0-s3}
+ vldmia r1!, {s8-s11}
+ vldmdb r2!, {s4-s7}
+ vldmia r1!, {s12-s15}
+ vmul.f32 s8, s3, s8
+ vmul.f32 s9, s2, s9
+ vmul.f32 s10, s1, s10
+ vmul.f32 s11, s0, s11
+1:
+ subs r3, r3, #16
+ vldmdbge r2!, {s16-s19}
+ vmul.f32 s12, s7, s12
+ vldmiage r1!, {s24-s27}
+ vmul.f32 s13, s6, s13
+ vldmdbge r2!, {s20-s23}
+ vmul.f32 s14, s5, s14
+ vldmiage r1!, {s28-s31}
+ vmul.f32 s15, s4, s15
+ vmulge.f32 s24, s19, s24
+ vldmdbgt r2!, {s0-s3}
+ vmulge.f32 s25, s18, s25
+ vstmia r0!, {s8-s13}
+ vmulge.f32 s26, s17, s26
+ vldmiagt r1!, {s8-s11}
+ vmulge.f32 s27, s16, s27
+ vmulge.f32 s28, s23, s28
+ vldmdbgt r2!, {s4-s7}
+ vmulge.f32 s29, s22, s29
+ vstmia r0!, {s14-s15}
+ vmulge.f32 s30, s21, s30
+ vmulge.f32 s31, s20, s31
+ vmulge.f32 s8, s3, s8
+ vldmiagt r1!, {s12-s15}
+ vmulge.f32 s9, s2, s9
+ vmulge.f32 s10, s1, s10
+ vstmiage r0!, {s24-s27}
+ vmulge.f32 s11, s0, s11
+ vstmiage r0!, {s28-s31}
+ bgt 1b
+
+ vpop {d8-d15}
+ bx lr
+ .endfunc
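The routine above multiplies src0 by src1 read backwards, matching the generic vector_fmul_reverse_c. A scalar sketch under the same assumptions (hypothetical helper name):

    static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                        const float *src1, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }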
+
+#ifdef HAVE_ARMV6
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assume that len is a positive number and a multiple of 8, that the
+ * destination buffer is at least 4-byte aligned (8-byte alignment is better
+ * for performance), and that the byte order is little endian.
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+ push {r4-r8,lr}
+ vpush {d8-d11}
+ vldmia r1!, {s16-s23}
+ vcvt.s32.f32 s0, s16
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1:
+ subs r2, r2, #8
+ vmov r3, r4, s0, s1
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ vldmiagt r1!, {s16-s23}
+ ssat r4, #16, r4
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16
+ pkhbt r4, r5, r6, lsl #16
+ vcvtgt.s32.f32 s0, s16
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6}
+ bgt 1b
+
+ vpop {d8-d11}
+ pop {r4-r8,pc}
+ .endfunc
+#endif
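A scalar sketch of the conversion above: values are converted with round toward zero (as the non-R vcvt.s32.f32 form does) and saturated to the int16_t range (the ssat #16 instructions). The helper name is illustrative; the float-side clamping is only there to keep the C sketch well defined:

    #include <stdint.h>

    static void float_to_int16_ref(int16_t *dst, const float *src, int len)
    {
        int i;
        for (i = 0; i < len; i++) {
            float f = src[i];
            if (f >  32767.0f) f =  32767.0f;   /* saturate like ssat #16 */
            if (f < -32768.0f) f = -32768.0f;
            dst[i] = (int16_t)f;                /* truncation toward zero */
        }
    }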
diff --git a/libavcodec/arm/float_arm_vfp.c b/libavcodec/arm/float_arm_vfp.c
new file mode 100644
index 0000000000..5598aa9c02
--- /dev/null
+++ b/libavcodec/arm/float_arm_vfp.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+void ff_vector_fmul_vfp(float *dst, const float *src, int len);
+void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
+ const float *src1, int len);
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
+{
+ c->vector_fmul = ff_vector_fmul_vfp;
+ c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
+#ifdef HAVE_ARMV6
+ c->float_to_int16 = ff_float_to_int16_vfp;
+#endif
+}
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
new file mode 100644
index 0000000000..39a8daf62e
--- /dev/null
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -0,0 +1,1377 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ .fpu neon
+
+ .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r4
+ vtrn.32 \r1, \r5
+ vtrn.32 \r2, \r6
+ vtrn.32 \r3, \r7
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+ .endm
+
+ .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
+ vswp \r0, \r4
+ vswp \r1, \r5
+ vswp \r2, \r6
+ vswp \r3, \r7
+ .endm
+
+ .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+ .endm
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro h264_chroma_mc8 avg=0
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.if \avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ muls r7, r4, r5
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4, d5}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r4
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6, d7}, [r5], r4
+ pld [r1]
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+
+ beq 4f
+
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.64 {d4}, [r1], r4
+ vld1.64 {d6}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r1], r4
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r4
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4, d5}, [r1], r2
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.if \avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+ .endm
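The weight setup at the top of the macro (r4, ip, r6, r7 after the muls/rsb/sub sequence) corresponds to the standard H.264 bilinear chroma interpolation; a scalar sketch with a hypothetical helper name, where x and y are the 1/8-pel fractional offsets in 0..7:

    #include <stdint.h>

    static void put_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                        int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;
        int i, j;

        for (j = 0; j < h; j++) {
            for (i = 0; i < 8; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;
            dst += stride;
            src += stride;
        }
    }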
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro h264_chroma_mc4 avg=0
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.if \avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ muls r7, r4, r5
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.64 {d4}, [r1], r4
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vld1.64 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r3, r3, #2
+ pld [r1]
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ vtrn.32 d0, d1
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.32 {d4[0]}, [r1], r4
+ vld1.32 {d4[1]}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r1], r4
+ vmull.u8 q9, d4, d1
+ vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+5: vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vld1.64 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+ vrshrn.u16 d16, q8, #6
+.if \avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vld1.64 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+ .endm
+
+ .text
+ .align
+
+function ff_put_h264_chroma_mc8_neon, export=1
+ h264_chroma_mc8
+ .endfunc
+
+function ff_avg_h264_chroma_mc8_neon, export=1
+ h264_chroma_mc8 avg=1
+ .endfunc
+
+function ff_put_h264_chroma_mc4_neon, export=1
+ h264_chroma_mc4
+ .endfunc
+
+function ff_avg_h264_chroma_mc4_neon, export=1
+ h264_chroma_mc4 avg=1
+ .endfunc
+
+ /* H.264 loop filter */
+
+ .macro h264_loop_filter_start
+ ldr ip, [sp]
+ tst r2, r2
+ ldr ip, [ip]
+ tstne r3, r3
+ vmov.32 d24[0], ip
+ and ip, ip, ip, lsl #16
+ bxeq lr
+ ands ip, ip, ip, lsl #8
+ bxlt lr
+ .endm
+
+ .macro align_push_regs
+ and ip, sp, #15
+ add ip, ip, #32
+ sub sp, sp, ip
+ vst1.64 {d12-d15}, [sp,:128]
+ sub sp, sp, #32
+ vst1.64 {d8-d11}, [sp,:128]
+ .endm
+
+ .macro align_pop_regs
+ vld1.64 {d8-d11}, [sp,:128]!
+ vld1.64 {d12-d15}, [sp,:128], ip
+ .endm
+
+ .macro h264_loop_filter_luma
+ vdup.8 q11, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 q6, q8, q0 @ abs(p0 - q0)
+ vmovl.u16 q12, d24
+ vabd.u8 q14, q9, q8 @ abs(p1 - p0)
+ vsli.16 q12, q12, #8
+ vabd.u8 q15, q1, q0 @ abs(q1 - q0)
+ vsli.32 q12, q12, #16
+ vclt.u8 q6, q6, q11 @ < alpha
+ vdup.8 q11, r3 @ beta
+ vclt.s8 q7, q12, #0
+ vclt.u8 q14, q14, q11 @ < beta
+ vclt.u8 q15, q15, q11 @ < beta
+ vbic q6, q6, q7
+ vabd.u8 q4, q10, q8 @ abs(p2 - p0)
+ vand q6, q6, q14
+ vabd.u8 q5, q2, q0 @ abs(q2 - q0)
+ vclt.u8 q4, q4, q11 @ < beta
+ vand q6, q6, q15
+ vclt.u8 q5, q5, q11 @ < beta
+ vand q4, q4, q6
+ vand q5, q5, q6
+ vand q12, q12, q6
+ vrhadd.u8 q14, q8, q0
+ vsub.i8 q6, q12, q4
+ vqadd.u8 q7, q9, q12
+ vhadd.u8 q10, q10, q14
+ vsub.i8 q6, q6, q5
+ vhadd.u8 q14, q2, q14
+ vmin.u8 q7, q7, q10
+ vqsub.u8 q11, q9, q12
+ vqadd.u8 q2, q1, q12
+ vmax.u8 q7, q7, q11
+ vqsub.u8 q11, q1, q12
+ vmin.u8 q14, q2, q14
+ vmovl.u8 q2, d0
+ vmax.u8 q14, q14, q11
+ vmovl.u8 q10, d1
+ vsubw.u8 q2, q2, d16
+ vsubw.u8 q10, q10, d17
+ vshl.i16 q2, q2, #2
+ vshl.i16 q10, q10, #2
+ vaddw.u8 q2, q2, d18
+ vaddw.u8 q10, q10, d19
+ vsubw.u8 q2, q2, d2
+ vsubw.u8 q10, q10, d3
+ vrshrn.i16 d4, q2, #3
+ vrshrn.i16 d5, q10, #3
+ vbsl q4, q7, q9
+ vbsl q5, q14, q1
+ vneg.s8 q7, q6
+ vmovl.u8 q14, d16
+ vmin.s8 q2, q2, q6
+ vmovl.u8 q6, d17
+ vmax.s8 q2, q2, q7
+ vmovl.u8 q11, d0
+ vmovl.u8 q12, d1
+ vaddw.s8 q14, q14, d4
+ vaddw.s8 q6, q6, d5
+ vsubw.s8 q11, q11, d4
+ vsubw.s8 q12, q12, d5
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d17, q6
+ vqmovun.s16 d0, q11
+ vqmovun.s16 d1, q12
+ .endm
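For orientation, the p0/q0 update at the heart of the macro (the shl #2 / add p1 / sub q1 / vrshrn #3 sequence followed by the min/max clip) matches the normal (bS < 4) H.264 filter. A scalar sketch of just that step, assuming tc has already been adjusted for the p2/q2 conditions that the macro handles separately (helper name is illustrative):

    #include <stdint.h>

    static void luma_filter_p0q0_ref(uint8_t *p0, uint8_t *q0,
                                     int p1, int q1, int tc)
    {
        int delta, a, b;
        delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
        if (delta < -tc) delta = -tc;
        if (delta >  tc) delta =  tc;
        a = *p0 + delta;
        b = *q0 - delta;
        *p0 = a < 0 ? 0 : a > 255 ? 255 : a;   /* clip to 8 bits */
        *q0 = b < 0 ? 0 : b > 255 ? 255 : b;
    }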
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ vld1.64 {d0, d1}, [r0,:128], r1
+ vld1.64 {d2, d3}, [r0,:128], r1
+ vld1.64 {d4, d5}, [r0,:128], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d20,d21}, [r0,:128], r1
+ vld1.64 {d18,d19}, [r0,:128], r1
+ vld1.64 {d16,d17}, [r0,:128], r1
+
+ align_push_regs
+
+ h264_loop_filter_luma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d8, d9}, [r0,:128], r1
+ vst1.64 {d16,d17}, [r0,:128], r1
+ vst1.64 {d0, d1}, [r0,:128], r1
+ vst1.64 {d10,d11}, [r0,:128]
+
+ align_pop_regs
+ bx lr
+ .endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #4
+ vld1.64 {d6}, [r0], r1
+ vld1.64 {d20}, [r0], r1
+ vld1.64 {d18}, [r0], r1
+ vld1.64 {d16}, [r0], r1
+ vld1.64 {d0}, [r0], r1
+ vld1.64 {d2}, [r0], r1
+ vld1.64 {d4}, [r0], r1
+ vld1.64 {d26}, [r0], r1
+ vld1.64 {d7}, [r0], r1
+ vld1.64 {d21}, [r0], r1
+ vld1.64 {d19}, [r0], r1
+ vld1.64 {d17}, [r0], r1
+ vld1.64 {d1}, [r0], r1
+ vld1.64 {d3}, [r0], r1
+ vld1.64 {d5}, [r0], r1
+ vld1.64 {d27}, [r0], r1
+
+ transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
+
+ align_push_regs
+ sub sp, sp, #16
+ vst1.64 {d4, d5}, [sp,:128]
+ sub sp, sp, #16
+ vst1.64 {d20,d21}, [sp,:128]
+
+ h264_loop_filter_luma
+
+ vld1.64 {d20,d21}, [sp,:128]!
+ vld1.64 {d4, d5}, [sp,:128]!
+
+ transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
+
+ sub r0, r0, r1, lsl #4
+ vst1.64 {d6}, [r0], r1
+ vst1.64 {d20}, [r0], r1
+ vst1.64 {d8}, [r0], r1
+ vst1.64 {d16}, [r0], r1
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d10}, [r0], r1
+ vst1.64 {d4}, [r0], r1
+ vst1.64 {d26}, [r0], r1
+ vst1.64 {d7}, [r0], r1
+ vst1.64 {d21}, [r0], r1
+ vst1.64 {d9}, [r0], r1
+ vst1.64 {d17}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d11}, [r0], r1
+ vst1.64 {d5}, [r0], r1
+ vst1.64 {d27}, [r0], r1
+
+ align_pop_regs
+ bx lr
+ .endfunc
+
+ .macro h264_loop_filter_chroma
+ vdup.8 d22, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 d26, d16, d0 @ abs(p0 - q0)
+ vmovl.u8 q2, d0
+ vabd.u8 d28, d18, d16 @ abs(p1 - p0)
+ vsubw.u8 q2, q2, d16
+ vsli.16 d24, d24, #8
+ vshl.i16 q2, q2, #2
+ vabd.u8 d30, d2, d0 @ abs(q1 - q0)
+ vaddw.u8 q2, q2, d18
+ vclt.u8 d26, d26, d22 @ < alpha
+ vsubw.u8 q2, q2, d2
+ vdup.8 d22, r3 @ beta
+ vclt.s8 d25, d24, #0
+ vrshrn.i16 d4, q2, #3
+ vclt.u8 d28, d28, d22 @ < beta
+ vbic d26, d26, d25
+ vclt.u8 d30, d30, d22 @ < beta
+ vand d26, d26, d28
+ vneg.s8 d25, d24
+ vand d26, d26, d30
+ vmin.s8 d4, d4, d24
+ vmovl.u8 q14, d16
+ vand d4, d4, d26
+ vmax.s8 d4, d4, d25
+ vmovl.u8 q11, d0
+ vaddw.s8 q14, q14, d4
+ vsubw.s8 q11, q11, d4
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d0, q11
+ .endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d18}, [r0,:64], r1
+ vld1.64 {d16}, [r0,:64], r1
+ vld1.64 {d0}, [r0,:64], r1
+ vld1.64 {d2}, [r0,:64]
+
+ h264_loop_filter_chroma
+
+ sub r0, r0, r1, lsl #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d0}, [r0,:64], r1
+
+ bx lr
+ .endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #2
+ vld1.32 {d18[0]}, [r0], r1
+ vld1.32 {d16[0]}, [r0], r1
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d2[0]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[1]}, [r0], r1
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ h264_loop_filter_chroma
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ sub r0, r0, r1, lsl #3
+ vst1.32 {d18[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d18[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d2[1]}, [r0], r1
+
+ bx lr
+ .endfunc
+
+ /* H.264 qpel MC */
+
+ .macro lowpass_const r
+ movw \r, #5
+ movt \r, #20
+ vmov.32 d6[0], \r
+ .endm
+
+ .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
+.if \narrow
+ t0 .req q0
+ t1 .req q8
+.else
+ t0 .req \d0
+ t1 .req \d1
+.endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vext.8 d18, \r2, \r3, #2
+ vmla.i16 t0, q1, d6[1]
+ vext.8 d19, \r2, \r3, #3
+ vaddl.u8 q9, d18, d19
+ vext.8 d20, \r2, \r3, #1
+ vmls.i16 t0, q2, d6[0]
+ vext.8 d21, \r2, \r3, #4
+ vaddl.u8 q10, d20, d21
+ vext.8 d31, \r2, \r3, #5
+ vaddl.u8 t1, \r2, d31
+ vmla.i16 t1, q9, d6[1]
+ vmls.i16 t1, q10, d6[0]
+.if \narrow
+ vqrshrun.s16 \d0, t0, #5
+ vqrshrun.s16 \d1, t1, #5
+.endif
+ .unreq t0
+ .unreq t1
+ .endm
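The constants loaded by lowpass_const (5 in d6[0], 20 in d6[1]) and the final vqrshrun #5 implement the standard 6-tap H.264 half-pel filter. A scalar sketch of one output sample, with a hypothetical helper name; src points at the third of the six input samples:

    #include <stdint.h>

    static uint8_t lowpass_tap_ref(const uint8_t *src)
    {
        int v = src[-2] - 5 * src[-1] + 20 * src[0] +
                20 * src[1] - 5 * src[2] + src[3];
        v = (v + 16) >> 5;          /* rounding shift, as vqrshrun #5 */
        if (v < 0)   v = 0;         /* unsigned saturation */
        if (v > 255) v = 255;
        return v;
    }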
+
+ .macro lowpass_8_1 r0, r1, d0, narrow=1
+.if \narrow
+ t0 .req q0
+.else
+ t0 .req \d0
+.endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vmla.i16 t0, q1, d6[1]
+ vmls.i16 t0, q2, d6[0]
+.if \narrow
+ vqrshrun.s16 \d0, t0, #5
+.endif
+ .unreq t0
+ .endm
+
+ .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
+ vext.16 q1, \r0, \r1, #2
+ vext.16 q0, \r0, \r1, #3
+ vaddl.s16 q9, d2, d0
+ vext.16 q2, \r0, \r1, #1
+ vaddl.s16 q1, d3, d1
+ vext.16 q3, \r0, \r1, #4
+ vaddl.s16 q10, d4, d6
+ vext.16 \r1, \r0, \r1, #5
+ vaddl.s16 q2, d5, d7
+ vaddl.s16 q0, \h0, \h1
+ vaddl.s16 q8, \l0, \l1
+
+ vshl.i32 q3, q9, #4
+ vshl.i32 q9, q9, #2
+ vshl.i32 q15, q10, #2
+ vadd.i32 q9, q9, q3
+ vadd.i32 q10, q10, q15
+
+ vshl.i32 q3, q1, #4
+ vshl.i32 q1, q1, #2
+ vshl.i32 q15, q2, #2
+ vadd.i32 q1, q1, q3
+ vadd.i32 q2, q2, q15
+
+ vadd.i32 q9, q9, q8
+ vsub.i32 q9, q9, q10
+
+ vadd.i32 q1, q1, q0
+ vsub.i32 q1, q1, q2
+
+ vrshrn.s32 d18, q9, #10
+ vrshrn.s32 d19, q1, #10
+
+ vqmovun.s16 \d, q9
+ .endm
+
+function put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, lr
+ mov ip, #16
+ mov r3, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ sub r1, r1, r2, lsl #4
+ add r1, r1, #8
+ mov ip, #16
+ mov lr, r4
+ b put_h264_qpel8_h_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_h_lowpass_neon
+ push {lr}
+ mov ip, #16
+ bl put_h264_qpel8_h_lowpass_neon
+ sub r0, r0, r3, lsl #4
+ sub r1, r1, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov ip, #16
+ pop {lr}
+ .endfunc
+
+function put_h264_qpel8_h_lowpass_neon
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d16,d17}, [r1], r2
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, d0, d16
+ vst1.64 {d0}, [r0,:64], r3
+ vst1.64 {d16}, [r0,:64], r3
+ bne 1b
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_h_lowpass_l2_neon
+ push {lr}
+ mov ip, #16
+ bl put_h264_qpel8_h_lowpass_l2_neon
+ sub r0, r0, r2, lsl #4
+ sub r1, r1, r2, lsl #4
+ sub r3, r3, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ add r3, r3, #8
+ mov ip, #16
+ pop {lr}
+ .endfunc
+
+function put_h264_qpel8_h_lowpass_l2_neon
+1: vld1.64 {d0, d1}, [r1], r2
+ vld1.64 {d16,d17}, [r1], r2
+ vld1.64 {d28}, [r3], r2
+ vld1.64 {d29}, [r3], r2
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, d0, d1
+ vrhadd.u8 q0, q0, q14
+ vst1.64 {d0}, [r0,:64], r2
+ vst1.64 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, lr
+ mov r2, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ b put_h264_qpel8_v_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_neon
+ mov r4, lr
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ .endfunc
+
+function put_h264_qpel8_v_lowpass_neon
+ vld1.64 {d8}, [r1], r3
+ vld1.64 {d10}, [r1], r3
+ vld1.64 {d12}, [r1], r3
+ vld1.64 {d14}, [r1], r3
+ vld1.64 {d22}, [r1], r3
+ vld1.64 {d24}, [r1], r3
+ vld1.64 {d26}, [r1], r3
+ vld1.64 {d28}, [r1], r3
+ vld1.64 {d9}, [r1], r3
+ vld1.64 {d11}, [r1], r3
+ vld1.64 {d13}, [r1], r3
+ vld1.64 {d15}, [r1], r3
+ vld1.64 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d10
+ lowpass_8 d12, d13, d14, d15, d12, d14
+ lowpass_8 d22, d23, d24, d25, d22, d24
+ lowpass_8 d26, d27, d28, d29, d26, d28
+ transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
+
+ vst1.64 {d8}, [r0,:64], r2
+ vst1.64 {d10}, [r0,:64], r2
+ vst1.64 {d12}, [r0,:64], r2
+ vst1.64 {d14}, [r0,:64], r2
+ vst1.64 {d22}, [r0,:64], r2
+ vst1.64 {d24}, [r0,:64], r2
+ vst1.64 {d26}, [r0,:64], r2
+ vst1.64 {d28}, [r0,:64], r2
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_v_lowpass_l2_neon
+ mov r4, lr
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r0, r0, r3, lsl #4
+ sub ip, ip, r2, lsl #4
+ add r0, r0, #8
+ add ip, ip, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ .endfunc
+
+function put_h264_qpel8_v_lowpass_l2_neon
+ vld1.64 {d8}, [r1], r3
+ vld1.64 {d10}, [r1], r3
+ vld1.64 {d12}, [r1], r3
+ vld1.64 {d14}, [r1], r3
+ vld1.64 {d22}, [r1], r3
+ vld1.64 {d24}, [r1], r3
+ vld1.64 {d26}, [r1], r3
+ vld1.64 {d28}, [r1], r3
+ vld1.64 {d9}, [r1], r3
+ vld1.64 {d11}, [r1], r3
+ vld1.64 {d13}, [r1], r3
+ vld1.64 {d15}, [r1], r3
+ vld1.64 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d9
+ lowpass_8 d12, d13, d14, d15, d12, d13
+ lowpass_8 d22, d23, d24, d25, d22, d23
+ lowpass_8 d26, d27, d28, d29, d26, d27
+ transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
+
+ vld1.64 {d0}, [ip], r2
+ vld1.64 {d1}, [ip], r2
+ vld1.64 {d2}, [ip], r2
+ vld1.64 {d3}, [ip], r2
+ vld1.64 {d4}, [ip], r2
+ vrhadd.u8 q0, q0, q4
+ vld1.64 {d5}, [ip], r2
+ vrhadd.u8 q1, q1, q6
+ vld1.64 {d10}, [ip], r2
+ vrhadd.u8 q2, q2, q11
+ vld1.64 {d11}, [ip], r2
+
+ vst1.64 {d0}, [r0,:64], r3
+ vst1.64 {d1}, [r0,:64], r3
+ vrhadd.u8 q5, q5, q13
+ vst1.64 {d2}, [r0,:64], r3
+ vst1.64 {d3}, [r0,:64], r3
+ vst1.64 {d4}, [r0,:64], r3
+ vst1.64 {d5}, [r0,:64], r3
+ vst1.64 {d10}, [r0,:64], r3
+ vst1.64 {d11}, [r0,:64], r3
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_neon_top
+ lowpass_const ip
+ mov ip, #12
+1: vld1.64 {d0, d1}, [r1], r3
+ vld1.64 {d16,d17}, [r1], r3
+ subs ip, ip, #2
+ lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
+ vst1.64 {d22-d25}, [r4,:128]!
+ bne 1b
+
+ vld1.64 {d0, d1}, [r1]
+ lowpass_8_1 d0, d1, q12, narrow=0
+
+ mov ip, #-16
+ add r4, r4, ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ vld1.64 {d20,d21}, [r4,:128], ip
+ vld1.64 {d18,d19}, [r4,:128], ip
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d14,d15}, [r4,:128], ip
+ vld1.64 {d12,d13}, [r4,:128], ip
+ vld1.64 {d10,d11}, [r4,:128], ip
+ vld1.64 {d8, d9}, [r4,:128], ip
+ vld1.64 {d6, d7}, [r4,:128], ip
+ vld1.64 {d4, d5}, [r4,:128], ip
+ vld1.64 {d2, d3}, [r4,:128], ip
+ vld1.64 {d0, d1}, [r4,:128]
+
+ swap4 d1, d3, d5, d7, d8, d10, d12, d14
+ transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
+
+ swap4 d17, d19, d21, d31, d24, d26, d28, d22
+ transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
+
+ vst1.64 {d30,d31}, [r4,:128]!
+ vst1.64 {d6, d7}, [r4,:128]!
+ vst1.64 {d20,d21}, [r4,:128]!
+ vst1.64 {d4, d5}, [r4,:128]!
+ vst1.64 {d18,d19}, [r4,:128]!
+ vst1.64 {d2, d3}, [r4,:128]!
+ vst1.64 {d16,d17}, [r4,:128]!
+ vst1.64 {d0, d1}, [r4,:128]
+
+ lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
+ lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
+ lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
+ lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
+
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128], ip
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
+ vld1.64 {d16,d17}, [r4,:128], ip
+ vld1.64 {d30,d31}, [r4,:128]
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
+
+ transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
+
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+ vst1.64 {d12}, [r0,:64], r2
+ vst1.64 {d13}, [r0,:64], r2
+ vst1.64 {d14}, [r0,:64], r2
+ vst1.64 {d15}, [r0,:64], r2
+ vst1.64 {d8}, [r0,:64], r2
+ vst1.64 {d9}, [r0,:64], r2
+ vst1.64 {d10}, [r0,:64], r2
+ vst1.64 {d11}, [r0,:64], r2
+
+ mov lr, r10
+ bx lr
+ .endfunc
+
+function put_h264_qpel8_hv_lowpass_l2_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+
+ vld1.64 {d0, d1}, [r2,:128]!
+ vld1.64 {d2, d3}, [r2,:128]!
+ vrhadd.u8 q0, q0, q6
+ vld1.64 {d4, d5}, [r2,:128]!
+ vrhadd.u8 q1, q1, q7
+ vld1.64 {d6, d7}, [r2,:128]!
+ vrhadd.u8 q2, q2, q4
+
+ vst1.64 {d0}, [r0,:64], r3
+ vrhadd.u8 q3, q3, q5
+ vst1.64 {d1}, [r0,:64], r3
+ vst1.64 {d2}, [r0,:64], r3
+ vst1.64 {d3}, [r0,:64], r3
+ vst1.64 {d4}, [r0,:64], r3
+ vst1.64 {d5}, [r0,:64], r3
+ vst1.64 {d6}, [r0,:64], r3
+ vst1.64 {d7}, [r0,:64], r3
+
+ mov lr, r10
+ bx lr
+ .endfunc
+
+function put_h264_qpel16_hv_lowpass_neon
+ mov r9, lr
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ bl put_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b put_h264_qpel8_hv_lowpass_neon
+ .endfunc
+
+function put_h264_qpel16_hv_lowpass_l2_neon
+ mov r9, lr
+ sub r2, r4, #256
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r3, lsl #4
+ add r0, r0, #8
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b put_h264_qpel8_hv_lowpass_l2_neon
+ .endfunc
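Editorial note (not part of the patch): the lowpass_8 / lowpass_8.16 macros used throughout these routines are defined earlier in this file, outside this excerpt. For orientation, a scalar sketch of the standard H.264 six-tap half-sample filter they correspond to is given below; the function name is invented here purely for illustration.

    #include <stdint.h>

    /* H.264 six-tap half-sample filter: taps (1, -5, 20, 20, -5, 1),
     * rounded and clipped to the 8-bit range. */
    static uint8_t h264_lowpass6(int a, int b, int c, int d, int e, int f)
    {
        int v = a - 5*b + 20*c + 20*d - 5*e + f;
        v = (v + 16) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

In the _hv_ variants above, the horizontal pass keeps 16-bit intermediates (narrow=0) and the second pass (lowpass_8.16) then filters those with a wider rounding shift before storing or averaging into the destination.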
+
+function ff_put_h264_qpel8_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ mov ip, #8
+ b put_h264_qpel8_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel8_mc01_neon, export=1
+ push {lr}
+ mov ip, r1
+put_h264_qpel8_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc11_neon, export=1
+ push {r0, r1, r2, lr}
+put_h264_qpel8_mc11:
+ lowpass_const r3
+ sub sp, sp, #64
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #8
+ mov ip, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ ldrd r0, [sp, #128]
+ mov r3, r2
+ add ip, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #8
+ bl put_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, sp, #76
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc21_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+put_h264_qpel8_mc21:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, #2
+ mov r3, #8
+ mov r0, sp
+ mov ip, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub r2, r4, #64
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r2, lr}
+ sub r1, r1, #1
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel8_mc02_neon, export=1
+ push {lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc12_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+put_h264_qpel8_mc12:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ mov r2, #8
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_neon
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ sub r2, r4, #64
+ bl put_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc22_neon, export=1
+ push {r4, r10, r11, lr}
+ mov r11, sp
+ bic sp, sp, #15
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl put_h264_qpel8_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r10, r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel8_mc32_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, #1
+ b put_h264_qpel8_mc12
+ .endfunc
+
+function ff_put_h264_qpel8_mc03_neon, export=1
+ push {lr}
+ add ip, r1, r2
+ b put_h264_qpel8_mc01
+ .endfunc
+
+function ff_put_h264_qpel8_mc13_neon, export=1
+ push {r0, r1, r2, lr}
+ add r1, r1, r2
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel8_mc23_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, r2
+ b put_h264_qpel8_mc21
+ .endfunc
+
+function ff_put_h264_qpel8_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r2, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b put_h264_qpel8_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ b put_h264_qpel16_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ b put_h264_qpel16_h_lowpass_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ b put_h264_qpel16_h_lowpass_l2_neon
+ .endfunc
+
+function ff_put_h264_qpel16_mc01_neon, export=1
+ push {r4, lr}
+ mov ip, r1
+put_h264_qpel16_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc11_neon, export=1
+ push {r0, r1, r4, lr}
+put_h264_qpel16_mc11:
+ lowpass_const r3
+ sub sp, sp, #256
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #16
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon
+ add r0, sp, #256
+ ldrd r0, [r0, #64]
+ mov r3, r2
+ add ip, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #16
+ bl put_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, sp, #(256+8)
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc21_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+put_h264_qpel16_mc21:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, #2
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ bl put_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4-r5, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, lr}
+ sub r1, r1, #1
+ b put_h264_qpel16_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc02_neon, export=1
+ push {r4, lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc12_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+put_h264_qpel16_mc12:
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r0, sp
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, [r11]
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ mov r2, r3
+ bl put_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ add sp, r11, #8
+ pop {r4-r5, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc22_neon, export=1
+ push {r4, r9-r11, lr}
+ lowpass_const r3
+ mov r11, sp
+ bic sp, sp, #15
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl put_h264_qpel16_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r9-r11, pc}
+ .endfunc
+
+function ff_put_h264_qpel16_mc32_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, #1
+ b put_h264_qpel16_mc12
+ .endfunc
+
+function ff_put_h264_qpel16_mc03_neon, export=1
+ push {r4, lr}
+ add ip, r1, r2
+ b put_h264_qpel16_mc01
+ .endfunc
+
+function ff_put_h264_qpel16_mc13_neon, export=1
+ push {r0, r1, r4, lr}
+ add r1, r1, r2
+ b put_h264_qpel16_mc11
+ .endfunc
+
+function ff_put_h264_qpel16_mc23_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, r2
+ b put_h264_qpel16_mc21
+ .endfunc
+
+function ff_put_h264_qpel16_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b put_h264_qpel16_mc11
+ .endfunc
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
new file mode 100644
index 0000000000..b7ef2f4519
--- /dev/null
+++ b/libavcodec/arm/h264idct_neon.S
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ .fpu neon
+
+ .text
+
+function ff_h264_idct_add_neon, export=1
+ mov r3, #(1<<5)
+ vmov.i16 d16, #0
+ vmov.16 d16[0], r3
+ vld1.64 {d0-d3}, [r1,:128]
+ vadd.i16 d0, d0, d16
+
+ vswp d1, d2
+ vadd.i16 d4, d0, d1
+ vshr.s16 q8, q1, #1
+ vsub.i16 d5, d0, d1
+ vadd.i16 d6, d2, d17
+ vsub.i16 d7, d16, d3
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+
+ vadd.i16 d4, d0, d3
+ vld1.32 {d18[0]}, [r0,:32], r2
+ vswp d1, d3
+ vshr.s16 q8, q1, #1
+ vld1.32 {d19[1]}, [r0,:32], r2
+ vsub.i16 d5, d0, d1
+ vld1.32 {d18[1]}, [r0,:32], r2
+ vadd.i16 d6, d16, d3
+ vld1.32 {d19[0]}, [r0,:32], r2
+ vsub.i16 d7, d2, d17
+ sub r0, r0, r2, lsl #2
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vshr.s16 q0, q0, #6
+ vshr.s16 q1, q1, #6
+
+ vaddw.u8 q0, q0, d18
+ vaddw.u8 q1, q1, d19
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+
+ bx lr
+ .endfunc
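Editorial note (not part of the patch): after the vswp d1, d2 above, d0-d3 hold coefficient rows 0, 2, 1, 3, and the vadd/vsub/vshr #1 sequence is the usual H.264 4x4 inverse-transform butterfly, applied to the rows and then, after the vtrn transpose, to the columns. A scalar sketch of one 4-point pass; the function name is chosen here purely for illustration.

    /* One 4-point pass of the inverse transform implemented above. */
    static void h264_idct4_1d(int16_t b[4])
    {
        int z0 = b[0] + b[2];
        int z1 = b[0] - b[2];
        int z2 = (b[1] >> 1) - b[3];
        int z3 = b[1] + (b[3] >> 1);

        b[0] = z0 + z3;
        b[1] = z1 + z2;
        b[2] = z1 - z2;
        b[3] = z0 - z3;
    }

The rounding term (1<<5) is folded into the DC coefficient at the top of the function, so the final vshr.s16 #6 / vaddw.u8 / vqmovun.s16 sequence amounts to a rounded right shift by 6 followed by a saturating add to the existing pixels.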
+
+function ff_h264_idct_dc_add_neon, export=1
+ vld1.16 {d2[],d3[]}, [r1,:16]
+ vrshr.s16 q1, q1, #6
+ vld1.32 {d0[0]}, [r0,:32], r2
+ vld1.32 {d0[1]}, [r0,:32], r2
+ vaddw.u8 q2, q1, d0
+ vld1.32 {d1[0]}, [r0,:32], r2
+ vld1.32 {d1[1]}, [r0,:32], r2
+ vaddw.u8 q1, q1, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q1
+ sub r0, r0, r2, lsl #2
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ bx lr
+ .endfunc
diff --git a/libavcodec/arm/jrevdct_arm.S b/libavcodec/arm/jrevdct_arm.S
new file mode 100644
index 0000000000..b77315b0f5
--- /dev/null
+++ b/libavcodec/arm/jrevdct_arm.S
@@ -0,0 +1,388 @@
+/*
+ C-like prototype:
+ void j_rev_dct_ARM(DCTBLOCK data)
+
+ With DCTBLOCK being a pointer to an array of 64 'signed shorts'
+
+ Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+#include "asm.S"
+
+#define FIX_0_298631336 2446
+#define FIX_0_541196100 4433
+#define FIX_0_765366865 6270
+#define FIX_1_175875602 9633
+#define FIX_1_501321110 12299
+#define FIX_2_053119869 16819
+#define FIX_3_072711026 25172
+#define FIX_M_0_390180644 -3196
+#define FIX_M_0_899976223 -7373
+#define FIX_M_1_847759065 -15137
+#define FIX_M_1_961570560 -16069
+#define FIX_M_2_562915447 -20995
+#define FIX_0xFFFF 0xFFFF
+
+#define FIX_0_298631336_ID 0
+#define FIX_0_541196100_ID 4
+#define FIX_0_765366865_ID 8
+#define FIX_1_175875602_ID 12
+#define FIX_1_501321110_ID 16
+#define FIX_2_053119869_ID 20
+#define FIX_3_072711026_ID 24
+#define FIX_M_0_390180644_ID 28
+#define FIX_M_0_899976223_ID 32
+#define FIX_M_1_847759065_ID 36
+#define FIX_M_1_961570560_ID 40
+#define FIX_M_2_562915447_ID 44
+#define FIX_0xFFFF_ID 48
+ .text
+ .align
+
+function j_rev_dct_ARM, export=1
+ stmdb sp!, { r4 - r12, lr } @ all callee saved regs
+
+ sub sp, sp, #4 @ reserve some space on the stack
+ str r0, [ sp ] @ save the DCT pointer to the stack
+
+ mov lr, r0 @ lr = pointer to the current row
+ mov r12, #8 @ r12 = row-counter
+ add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
+row_loop:
+ ldrsh r0, [lr, # 0] @ r0 = 'd0'
+ ldrsh r2, [lr, # 2] @ r2 = 'd2'
+
+ @ Optimization for rows that have all items except the first set to 0
+ @ (this works as the DCTELEMS are always 4-byte aligned)
+ ldr r5, [lr, # 0]
+ ldr r6, [lr, # 4]
+ ldr r3, [lr, # 8]
+ ldr r4, [lr, #12]
+ orr r3, r3, r4
+ orr r3, r3, r6
+ orrs r5, r3, r5
+ beq end_of_row_loop @ nothing to be done as ALL of them are '0'
+ orrs r3, r3, r2
+ beq empty_row
+
+ ldrsh r1, [lr, # 8] @ r1 = 'd1'
+ ldrsh r4, [lr, # 4] @ r4 = 'd4'
+ ldrsh r6, [lr, # 6] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r7, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r7, r3, r7 @ r7 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r7 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r7 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r3, r6, r3, lsl #13 @ r3 = tmp12
+
+ stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ ldrsh r3, [lr, #10] @ r3 = 'd3'
+ ldrsh r5, [lr, #12] @ r5 = 'd5'
+ ldrsh r7, [lr, #14] @ r7 = 'd7'
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6 @ r8 = z3 + z4
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
+ add r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 0]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
+ sub r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #14]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
+ add r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 2]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
+ sub r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #12]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
+ add r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 4]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
+ sub r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #10]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
+ add r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 6]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
+ sub r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 8]
+
+ @ End of row loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
+ beq start_column_loop
+
+empty_row:
+ ldr r1, [r11, #FIX_0xFFFF_ID]
+ mov r0, r0, lsl #2
+ and r0, r0, r1
+ add r0, r0, r0, lsl #16
+ str r0, [lr, # 0]
+ str r0, [lr, # 4]
+ str r0, [lr, # 8]
+ str r0, [lr, #12]
+
+end_of_row_loop:
+ @ End of loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
+
+start_column_loop:
+ @ Start of column loop
+ ldr lr, [ sp ]
+ mov r12, #8
+column_loop:
+ ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
+ ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
+ ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
+ ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r1, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r1, r3, r1 @ r1 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r1 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r1 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r6, r6, r3, lsl #13 @ r6 = tmp12
+
+ ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
+ ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
+ ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
+ ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
+
+ @ Check for empty odd column (happens about 20-25% of the time according to my stats)
+ orr r9, r1, r3
+ orr r10, r5, r7
+ orrs r10, r9, r10
+ beq empty_odd_column
+
+ stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ add r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 0*8)]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ sub r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ add r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 2*8)]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ sub r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ add r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 4*8)]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ sub r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ add r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 6*8)]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ sub r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 8*8)]
+
+ @ End of row loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+ beq the_end
+
+empty_odd_column:
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ add r0, r0, #(1<<17)
+ mov r0, r0, asr #18
+ strh r0, [lr, #( 0*8)]
+ strh r0, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ add r4, r4, #(1<<17)
+ mov r4, r4, asr #18
+ strh r4, [lr, #( 2*8)]
+ strh r4, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ add r6, r6, #(1<<17)
+ mov r6, r6, asr #18
+ strh r6, [lr, #( 4*8)]
+ strh r6, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ add r2, r2, #(1<<17)
+ mov r2, r2, asr #18
+ strh r2, [lr, #( 6*8)]
+ strh r2, [lr, #( 8*8)]
+
+ @ End of row loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+
+the_end:
+ @ The end....
+ add sp, sp, #4
+ ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
+
+const_array:
+ .align
+ .word FIX_0_298631336
+ .word FIX_0_541196100
+ .word FIX_0_765366865
+ .word FIX_1_175875602
+ .word FIX_1_501321110
+ .word FIX_2_053119869
+ .word FIX_3_072711026
+ .word FIX_M_0_390180644
+ .word FIX_M_0_899976223
+ .word FIX_M_1_847759065
+ .word FIX_M_1_961570560
+ .word FIX_M_2_562915447
+ .word FIX_0xFFFF
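Editorial note (not part of the patch): the repeated "Compute DESCALE(...)" comments above refer to the libjpeg-style rounding shift, with CONST_BITS=13 and PASS1_BITS=2, so the row pass shifts by 11 and the column pass by 18. A sketch of the macro being mirrored:

    /* Rounding right shift used in both passes above: the row pass pairs
     * "add #(1<<10)" with "asr #11", the column pass "add #(1<<17)" with "asr #18". */
    #define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))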
diff --git a/libavcodec/arm/mathops.h b/libavcodec/arm/mathops.h
new file mode 100644
index 0000000000..e36316c76b
--- /dev/null
+++ b/libavcodec/arm/mathops.h
@@ -0,0 +1,93 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MATHOPS_H
+#define AVCODEC_ARM_MATHOPS_H
+
+#include <stdint.h>
+#include "libavutil/common.h"
+
+# define MULL MULL
+static inline av_const int MULL(int a, int b, unsigned shift)
+{
+ int lo, hi;
+ __asm__("smull %0, %1, %2, %3 \n\t"
+ "mov %0, %0, lsr %4 \n\t"
+ "add %1, %0, %1, lsl %5 \n\t"
+ : "=&r"(lo), "=&r"(hi)
+ : "r"(b), "r"(a), "i"(shift), "i"(32-shift));
+ return hi;
+}
+
+#define MULH MULH
+#ifdef HAVE_ARMV6
+static inline av_const int MULH(int a, int b)
+{
+ int r;
+ __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+#else
+static inline av_const int MULH(int a, int b)
+{
+ int lo, hi;
+ __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
+ return hi;
+}
+#endif
+
+static inline av_const int64_t MUL64(int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x;
+ __asm__ ("smull %0, %1, %2, %3"
+ : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b));
+ return x.x;
+}
+#define MUL64 MUL64
+
+static inline av_const int64_t MAC64(int64_t d, int a, int b)
+{
+ union { uint64_t x; unsigned hl[2]; } x = { d };
+ __asm__ ("smlal %0, %1, %2, %3"
+ : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b));
+ return x.x;
+}
+#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
+#define MLS64(d, a, b) MAC64(d, -(a), b)
+
+#if defined(HAVE_ARMV5TE)
+
+/* signed 16x16 -> 32 multiply add accumulate */
+# define MAC16(rt, ra, rb) \
+ __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
+
+/* signed 16x16 -> 32 multiply */
+# define MUL16 MUL16
+static inline av_const int MUL16(int ra, int rb)
+{
+ int rt;
+ __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
+ return rt;
+}
+
+#endif
+
+#endif /* AVCODEC_ARM_MATHOPS_H */
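Editorial note (not part of the patch): a portable sketch of what the inline-assembly helpers above compute; the *_ref names are invented here for illustration.

    #include <stdint.h>

    /* MULL(a, b, shift): bits [shift, shift+31] of the signed 64-bit product. */
    static inline int mull_ref(int a, int b, unsigned shift)
    {
        return (int)(((int64_t)a * b) >> shift);
    }

    /* MULH(a, b): high 32 bits of the signed 64-bit product (smmul/smull). */
    static inline int mulh_ref(int a, int b)
    {
        return (int)(((int64_t)a * b) >> 32);
    }

    /* MAC64(d, a, b): accumulate the signed 64-bit product (smlal);
     * MLS64 is the same with the product subtracted. */
    static inline int64_t mac64_ref(int64_t d, int a, int b)
    {
        return d + (int64_t)a * b;
    }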
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
new file mode 100644
index 0000000000..18faed2d36
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+void MPV_common_init_iwmmxt(MpegEncContext *s);
+void MPV_common_init_armv5te(MpegEncContext *s);
+
+void MPV_common_init_arm(MpegEncContext *s)
+{
+ /* IWMMXT support is a superset of armv5te, so
+ * install the armv5te-optimized functions first and let
+ * iwmmxt override them where a better implementation exists
+ */
+#ifdef HAVE_ARMV5TE
+ MPV_common_init_armv5te(s);
+#endif
+#ifdef HAVE_IWMMXT
+ MPV_common_init_iwmmxt(s);
+#endif
+}
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
new file mode 100644
index 0000000000..b213cf1de3
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -0,0 +1,100 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count);
+
+#ifdef ENABLE_ARM_TESTS
+/**
+ * H.263 dequantizer helper function. It is performance critical and needs
+ * optimized implementations for each architecture. It is also used as a
+ * reference implementation in regression tests.
+ */
+static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
+{
+ int i, level;
+ for (i = 0; i < count; i++) {
+ level = block[i];
+ if (level) {
+ if (level < 0) {
+ level = level * qmul - qadd;
+ } else {
+ level = level * qmul + qadd;
+ }
+ block[i] = level;
+ }
+ }
+}
+#endif
+
+static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int level, qmul, qadd;
+ int nCoeffs;
+
+ assert(s->block_last_index[n]>=0);
+
+ qmul = qscale << 1;
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level = block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+ block[0] = level;
+}
+
+static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int qmul, qadd;
+ int nCoeffs;
+
+ assert(s->block_last_index[n]>=0);
+
+ qadd = (qscale - 1) | 1;
+ qmul = qscale << 1;
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+}
+
+void MPV_common_init_armv5te(MpegEncContext *s)
+{
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
+}
diff --git a/libavcodec/arm/mpegvideo_armv5te_s.S b/libavcodec/arm/mpegvideo_armv5te_s.S
new file mode 100644
index 0000000000..aaa252d3b7
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -0,0 +1,117 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+/*
+ * Specially optimized version of dct_unquantize_h263_helper_c. It
+ * requires the block to be at least 8-byte aligned and may process
+ * more elements than requested, but it is guaranteed never to process
+ * more than 64 elements as long as the count argument is <= 64, so it
+ * is safe. The function is tuned for the common distribution of
+ * nCoeffs values (mostly a multiple of 8 plus one or two extra
+ * elements): it processes the data 8 elements per loop iteration, with
+ * an optional 2-element pass at the end.
+ *
+ * The inner loop should take 6 cycles per element on an ARM926EJ-S (Nokia 770).
+ */
+function ff_dct_unquantize_h263_armv5te, export=1
+ push {r4-r9,lr}
+ mov ip, #0
+ subs r3, r3, #2
+ ble 2f
+ ldrd r4, [r0, #0]
+1:
+ ldrd r6, [r0, #8]
+
+ rsbs r9, ip, r4, asr #16
+ addgt r9, r2, #0
+ rsblt r9, r2, #0
+ smlatbne r9, r4, r1, r9
+
+ rsbs lr, ip, r5, asr #16
+ addgt lr, r2, #0
+ rsblt lr, r2, #0
+ smlatbne lr, r5, r1, lr
+
+ rsbs r8, ip, r4, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r4, r4, r1, r8
+
+ rsbs r8, ip, r5, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r5, r5, r1, r8
+
+ strh r4, [r0], #2
+ strh r9, [r0], #2
+ strh r5, [r0], #2
+ strh lr, [r0], #2
+
+ rsbs r9, ip, r6, asr #16
+ addgt r9, r2, #0
+ rsblt r9, r2, #0
+ smlatbne r9, r6, r1, r9
+
+ rsbs lr, ip, r7, asr #16
+ addgt lr, r2, #0
+ rsblt lr, r2, #0
+ smlatbne lr, r7, r1, lr
+
+ rsbs r8, ip, r6, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r6, r6, r1, r8
+
+ rsbs r8, ip, r7, asl #16
+ addgt r8, r2, #0
+ rsblt r8, r2, #0
+ smlabbne r7, r7, r1, r8
+
+ strh r6, [r0], #2
+ strh r9, [r0], #2
+ strh r7, [r0], #2
+ strh lr, [r0], #2
+
+ subs r3, r3, #8
+ ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
+ bgt 1b
+
+ adds r3, r3, #2
+ pople {r4-r9,pc}
+2:
+ ldrsh r9, [r0, #0]
+ ldrsh lr, [r0, #2]
+ mov r8, r2
+ cmp r9, #0
+ rsblt r8, r2, #0
+ smlabbne r9, r9, r1, r8
+ mov r8, r2
+ cmp lr, #0
+ rsblt r8, r2, #0
+ smlabbne lr, lr, r1, r8
+ strh r9, [r0], #2
+ strh lr, [r0], #2
+ pop {r4-r9,pc}
+ .endfunc
diff --git a/libavcodec/arm/mpegvideo_iwmmxt.c b/libavcodec/arm/mpegvideo_iwmmxt.c
new file mode 100644
index 0000000000..e816ac67c5
--- /dev/null
+++ b/libavcodec/arm/mpegvideo_iwmmxt.c
@@ -0,0 +1,119 @@
+/*
+ * copyright (c) 2004 AGAWA Koji
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
+
+static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int level, qmul, qadd;
+ int nCoeffs;
+ DCTELEM *block_orig = block;
+
+ assert(s->block_last_index[n]>=0);
+
+ qmul = qscale << 1;
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level = block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ __asm__ volatile (
+/* "movd %1, %%mm6 \n\t" //qmul */
+/* "packssdw %%mm6, %%mm6 \n\t" */
+/* "packssdw %%mm6, %%mm6 \n\t" */
+ "tbcsth wr6, %[qmul] \n\t"
+/* "movd %2, %%mm5 \n\t" //qadd */
+/* "packssdw %%mm5, %%mm5 \n\t" */
+/* "packssdw %%mm5, %%mm5 \n\t" */
+ "tbcsth wr5, %[qadd] \n\t"
+ "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */
+ "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */
+ "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */
+ "1: \n\t"
+ "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */
+ "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */
+ "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */
+ "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */
+/* "movq (%0, %3), %%mm2 \n\t" */
+/* "movq 8(%0, %3), %%mm3 \n\t" */
+ "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */
+ "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */
+ "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */
+ "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */
+ "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */
+ "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */
+ "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */
+ "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */
+ "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */
+ "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */
+ "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */
+ "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */
+ "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */
+ "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */
+ "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */
+ "subs %[i], %[i], #1 \n\t"
+ "bne 1b \n\t" /* "jng 1b \n\t" */
+ :[block]"+r"(block)
+ :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd)
+ :"memory");
+
+ block_orig[0] = level;
+}
+
+#if 0
+static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int nCoeffs;
+
+ assert(s->block_last_index[n]>=0);
+
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
+}
+#endif
+
+void MPV_common_init_iwmmxt(MpegEncContext *s)
+{
+ if (!(mm_flags & FF_MM_IWMMXT)) return;
+
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
+#if 0
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
+#endif
+}
diff --git a/libavcodec/arm/simple_idct_arm.S b/libavcodec/arm/simple_idct_arm.S
new file mode 100644
index 0000000000..16ade1f3da
--- /dev/null
+++ b/libavcodec/arm/simple_idct_arm.S
@@ -0,0 +1,486 @@
+/*
+ * simple_idct_arm.S
+ * Copyright (C) 2002 Frederic 'dilb' Boulay.
+ *
+ * Author: Frederic Boulay <dilb@handhelds.org>
+ *
+ * The function defined in this file is derived from the simple_idct function
+ * from the libavcodec library part of the FFmpeg project.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+/* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
+/* the end of the source code. */
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6 8867
+#define W7 4520
+#define MASK_MSHW 0xFFFF0000
+
+/* offsets of the constants in the vector */
+#define offW1 0
+#define offW2 4
+#define offW3 8
+#define offW4 12
+#define offW5 16
+#define offW6 20
+#define offW7 24
+#define offMASK_MSHW 28
+
+#define ROW_SHIFT 11
+#define ROW_SHIFT2MSHW (16-11)
+#define COL_SHIFT 20
+#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
+#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
+
+
+ .text
+
+function simple_idct_ARM, export=1
+ @@ void simple_idct_ARM(int16_t *block)
+ @@ save the registers we need on the stack (take all of them);
+ @@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains the pointer to block,
+ @@ so it must not be overwritten unless it is saved!
+ @@ R12 is another scratch register, so it does not need to be saved either
+ @@ save all registers
+ stmfd sp!, {r4-r11, r14} @ R14 is also called LR
+ @@ at this point, R0=block, other registers are free.
+ add r14, r0, #112 @ R14=&block[8*7]; start from the last row and decrease the pointer until row=0, i.e. R14=block.
+ add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants; it is probably not necessary to reserve a register for it
+ @@ add 2 temporary variables in the stack: R0 and R14
+ sub sp, sp, #8 @ allow 2 local variables
+ str r0, [sp, #0] @ save block in sp[0]
+ @@ stack status
+ @@ sp+4 free
+ @@ sp+0 R0 (block)
+
+
+ @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
+
+
+__row_loop:
+ @@ read the row and check whether it is null, almost null, or neither; according to the StrongARM specs it is not necessary to optimize ldr accesses (i.e. to split 32-bit loads into two 16-bit ones), and keeping them whole leaves more usable registers :)
+ ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
+ ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
+ ldr r3, [r14, #8] @ R3=ROWr32[2]
+ ldr r4, [r14, #12] @ R4=ROWr32[3]
+ @@ check whether the words are null: if all of them are null, proceed with the next row (branch __end_row_loop);
+ @@ if ROWr16[0] is the only non-null one, handle this special case (branch __almost_empty_row);
+ @@ otherwise follow the complete algorithm.
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+ @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
+ orr r5, r4, r3 @ R5=R4 | R3
+ orr r5, r5, r2 @ R5=R4 | R3 | R2
+ orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
+ beq __end_row_loop
+ mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
+ ldrsh r6, [r14, #0] @ R6=ROWr16[0]
+ orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
+ beq __almost_empty_row
+
+__b_evaluation:
+ @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
+
+ @@ MUL16(b0, W1, row[1]);
+ @@ MUL16(b1, W3, row[1]);
+ @@ MUL16(b2, W5, row[1]);
+ @@ MUL16(b3, W7, row[1]);
+ @@ MAC16(b0, W3, row[3]);
+ @@ MAC16(b1, -W7, row[3]);
+ @@ MAC16(b2, -W1, row[3]);
+ @@ MAC16(b3, -W5, row[3]);
+ ldr r8, [r12, #offW1] @ R8=W1
+ mov r2, r2, asr #16 @ R2=ROWr16[3]
+ mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r9, [r12, #offW3] @ R9=W3
+ ldr r10, [r12, #offW5] @ R10=W5
+ mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r11, [r12, #offW7] @ R11=W7
+ mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if null avoid muls
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ rsbne r2, r2, #0 @ R2=-ROWr16[3]
+ mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+ @@ if (temp != 0) {}
+ orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
+ beq __end_b_evaluation
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ MAC16(b0, W5, row[5]);
+ @@ MAC16(b2, W7, row[5]);
+ @@ MAC16(b3, W3, row[5]);
+ @@ MAC16(b1, -W1, row[5]);
+ @@ MAC16(b0, W7, row[7]);
+ @@ MAC16(b2, W3, row[7]);
+ @@ MAC16(b3, -W1, row[7]);
+ @@ MAC16(b1, -W5, row[7]);
+ mov r3, r3, asr #16 @ R3=ROWr16[5]
+ teq r3, #0 @ if null avoid muls
+ mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
+ mov r4, r4, asr #16 @ R4=ROWr16[7]
+ mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
+ mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
+ rsbne r3, r3, #0 @ R3=-ROWr16[5]
+ mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
+ @@ R3 is free now
+ teq r4, #0 @ if null avoid muls
+ mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
+ mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
+ rsbne r4, r4, #0 @ R4=-ROWr16[7]
+ mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+ mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
+ @@ R4 is free now
+__end_b_evaluation:
+ @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation:
+ @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+ @@ a1 = a0 + W6 * row[2];
+ @@ a2 = a0 - W6 * row[2];
+ @@ a3 = a0 - W2 * row[2];
+ @@ a0 = a0 + W2 * row[2];
+ ldr r9, [r12, #offW4] @ R9=W4
+ mul r6, r9, r6 @ R6=W4*ROWr16[0]
+ ldr r10, [r12, #offW6] @ R10=W6
+ ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
+ add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
+
+ mul r11, r10, r4 @ R11=W6*ROWr16[2]
+ ldr r8, [r12, #offW2] @ R8=W2
+ sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+ @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+ @@ if (temp != 0) {}
+ teq r2, #0
+ beq __end_bef_a_evaluation
+
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+
+ @@ a0 += W4*row[4]
+ @@ a1 -= W4*row[4]
+ @@ a2 -= W4*row[4]
+ @@ a3 += W4*row[4]
+ ldrsh r11, [r14, #8] @ R11=ROWr16[4]
+ teq r11, #0 @ if null avoid muls
+ mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+ @@ R9 is free now
+ ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+ addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+ subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+ subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+ addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+ @@ W6 alone is no longer needed, save W2*ROWr16[6] in its place instead
+ teq r9, #0 @ if null avoid muls
+ mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+ addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+ mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+ @@ a0 += W6*row[6];
+ @@ a3 -= W6*row[6];
+ @@ a1 -= W2*row[6];
+ @@ a2 += W2*row[6];
+ subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+ addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+
+__end_a_evaluation:
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ row[0] = (a0 + b0) >> ROW_SHIFT;
+ @@ row[1] = (a1 + b1) >> ROW_SHIFT;
+ @@ row[2] = (a2 + b2) >> ROW_SHIFT;
+ @@ row[3] = (a3 + b3) >> ROW_SHIFT;
+ @@ row[4] = (a3 - b3) >> ROW_SHIFT;
+ @@ row[5] = (a2 - b2) >> ROW_SHIFT;
+ @@ row[6] = (a1 - b1) >> ROW_SHIFT;
+ @@ row[7] = (a0 - b0) >> ROW_SHIFT;
+ add r8, r6, r0 @ R8=a0+b0
+ add r9, r2, r1 @ R9=a1+b1
+ @@ put 2 16 bits half-words in a 32bits word
+ @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
+ ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
+ mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #0]
+
+ add r8, r3, r5 @ R8=a2+b2
+ add r9, r4, r7 @ R9=a3+b3
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #4]
+
+ sub r8, r4, r7 @ R8=a3-b3
+ sub r9, r3, r5 @ R9=a2-b2
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #8]
+
+ sub r8, r2, r1 @ R8=a1-b1
+ sub r9, r6, r0 @ R9=a0-b0
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #12]
+
+ bal __end_row_loop
+
+__almost_empty_row:
+ @@ the row was empty except for ROWr16[0]; now handle this special case
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+ @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
+ @@ R8=0xFFFF (temp), R9-R11 free
+ mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!); this saves an ldr and its load delay.
+ sub r8, r8, #1 @ R8 is now ready.
+ and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
+ orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
+ str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
+ str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
+ str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
+ str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
+
+__end_row_loop:
+ @@ at this point, R0-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ ldr r0, [sp, #0] @ R0=block
+ teq r0, r14 @ compare the current &block[8*n] to block; when block is reached, the loop is finished.
+ sub r14, r14, #16
+ bne __row_loop
+
+
+
+ @@ at this point, R0=block, R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ add r14, r0, #14 @ R14=&block[7]; start from the last column and decrease the pointer until col=0, i.e. R14=block.
+__col_loop:
+
+__b_evaluation2:
+ @@ at this point, R0=block (temp), R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ proceed with b0-b3 first, followed by a0-a3
+ @@ MUL16(b0, W1, col[8x1]);
+ @@ MUL16(b1, W3, col[8x1]);
+ @@ MUL16(b2, W5, col[8x1]);
+ @@ MUL16(b3, W7, col[8x1]);
+ @@ MAC16(b0, W3, col[8x3]);
+ @@ MAC16(b1, -W7, col[8x3]);
+ @@ MAC16(b2, -W1, col[8x3]);
+ @@ MAC16(b3, -W5, col[8x3]);
+ ldr r8, [r12, #offW1] @ R8=W1
+ ldrsh r7, [r14, #16]
+ mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r9, [r12, #offW3] @ R9=W3
+ ldr r10, [r12, #offW5] @ R10=W5
+ mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r11, [r12, #offW7] @ R11=W7
+ mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldrsh r2, [r14, #48]
+ mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if 0, then avoid muls
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ rsbne r2, r2, #0 @ R2=-ROWr16[3]
+ mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ MAC16(b0, W5, col[5x8]);
+ @@ MAC16(b2, W7, col[5x8]);
+ @@ MAC16(b3, W3, col[5x8]);
+ @@ MAC16(b1, -W1, col[5x8]);
+ @@ MAC16(b0, W7, col[7x8]);
+ @@ MAC16(b2, W3, col[7x8]);
+ @@ MAC16(b3, -W1, col[7x8]);
+ @@ MAC16(b1, -W5, col[7x8]);
+ ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
+ teq r3, #0 @ if 0 then avoid muls
+ mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
+ mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
+ mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
+ rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
+ ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+ mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
+ @@ R3 is free now
+ teq r4, #0 @ if 0 then avoid muls
+ mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
+ mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
+ rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
+ mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+ mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
+ @@ R4 is free now
+__end_b_evaluation2:
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation2:
+ @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+ @@ a1 = a0 + W6 * row[2];
+ @@ a2 = a0 - W6 * row[2];
+ @@ a3 = a0 - W2 * row[2];
+ @@ a0 = a0 + W2 * row[2];
+ ldrsh r6, [r14, #0]
+ ldr r9, [r12, #offW4] @ R9=W4
+ mul r6, r9, r6 @ R6=W4*ROWr16[0]
+ ldr r10, [r12, #offW6] @ R10=W6
+ ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
+ add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
+ mul r11, r10, r4 @ R11=W6*ROWr16[2]
+ ldr r8, [r12, #offW2] @ R8=W2
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ a0 += W4*row[4]
+ @@ a1 -= W4*row[4]
+ @@ a2 -= W4*row[4]
+ @@ a3 += W4*row[4]
+ ldrsh r11, [r14, #64] @ R11=ROWr16[4]
+ teq r11, #0 @ if null avoid muls
+ mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+ @@ R9 is free now
+ addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+ subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+ subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+ ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+ addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+ @@ W6 alone is no longer needed, save W2*ROWr16[6] in its place instead
+ teq r9, #0 @ if null avoid muls
+ mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+ addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+ mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+ @@ a0 += W6*row[6];
+ @@ a3 -= W6*row[6];
+ @@ a1 -= W2*row[6];
+ @@ a2 += W2*row[6];
+ subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+ addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+__end_a_evaluation2:
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+ @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+ @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+ @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+ @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+ @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+ @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+ @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+ @@@@@ no optimization here @@@@@
+ add r8, r6, r0 @ R8=a0+b0
+ add r9, r2, r1 @ R9=a1+b1
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #0]
+ strh r9, [r14, #16]
+ add r8, r3, r5 @ R8=a2+b2
+ add r9, r4, r7 @ R9=a3+b3
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #32]
+ strh r9, [r14, #48]
+ sub r8, r4, r7 @ R8=a3-b3
+ sub r9, r3, r5 @ R9=a2-b2
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #64]
+ strh r9, [r14, #80]
+ sub r8, r2, r1 @ R8=a1-b1
+ sub r9, r6, r0 @ R9=a0-b0
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #96]
+ strh r9, [r14, #112]
+
+__end_col_loop:
+ @@ at this point, R0-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ ldr r0, [sp, #0] @ R0=block
+ teq r0, r14 @ compare the current &block[n] to block; when block is reached, the loop is finished.
+ sub r14, r14, #2
+ bne __col_loop
+
+
+
+
+__end_simple_idct_ARM:
+ @@ restore registers to previous status!
+ add sp, sp, #8 @@ the local variables!
+ ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
+
+
+
+@@ a kind of sub-function, placed here so as not to burden the common case.
+__end_bef_a_evaluation:
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+ bal __end_a_evaluation
+
+
+__constant_ptr__: @@ see #defines at the beginning of the source code for values.
+ .align
+ .word W1
+ .word W2
+ .word W3
+ .word W4
+ .word W5
+ .word W6
+ .word W7
+ .word MASK_MSHW
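Editorial note (not part of the patch): the __almost_empty_row path above handles a row whose only non-zero coefficient is row[0]; in that case the whole row pass collapses to replicating the scaled DC value, which is what the "lsl #3", replicate and four-str sequence implements. A scalar model:

    /* Model of the __almost_empty_row shortcut: every output of the row
     * is row[0] << 3 (stored as 16-bit halfwords, as in the asm). */
    static void idct_row_dc_only(int16_t *row)
    {
        int16_t dc = row[0] << 3;
        for (int i = 0; i < 8; i++)
            row[i] = dc;
    }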
diff --git a/libavcodec/arm/simple_idct_armv5te.S b/libavcodec/arm/simple_idct_armv5te.S
new file mode 100644
index 0000000000..58040ec1ba
--- /dev/null
+++ b/libavcodec/arm/simple_idct_armv5te.S
@@ -0,0 +1,703 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define W13 (W1 | (W3 << 16))
+#define W26 (W2 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+
+ .text
+ .align
+w13: .long W13
+w26: .long W26
+w57: .long W57
+
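+/*
+ * Note (illustrative only, not assembled): W1..W7 above are the usual
+ * simple_idct weights, round(cos(i*M_PI/16)*sqrt(2)*(1<<14)), except that W4
+ * is kept at 16383 = (1<<14)-1 rather than the rounded 16384.  W13/W26/W57
+ * pack two weights per word so the halfword multiplies (smulbb, smultb, ...)
+ * can pick either factor.  A C sketch of the table and the packing
+ * (make_weights is a hypothetical helper, not part of FFmpeg):
+ *
+ *     #include <math.h>
+ *     #include <stdint.h>
+ *
+ *     static void make_weights(int16_t w[8], uint32_t pair[3])
+ *     {
+ *         int i;
+ *         for (i = 1; i <= 7; i++)
+ *             w[i] = (int16_t)(cos(i * M_PI / 16) * sqrt(2) * (1 << 14) + 0.5);
+ *         w[4] = 16383;                                        // as #defined above
+ *         pair[0] = (uint16_t)w[1] | ((uint32_t)w[3] << 16);   // W13
+ *         pair[1] = (uint16_t)w[2] | ((uint32_t)w[6] << 16);   // W26
+ *         pair[2] = (uint16_t)w[5] | ((uint32_t)w[7] << 16);   // W57
+ *     }
+ */
+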
+function idct_row_armv5te
+ str lr, [sp, #-4]!
+
+ ldrd v1, [a1, #8] /* v1 = row[5:4], v2 = row[7:6] */
+ ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
+ orrs v1, v1, v2 /* any non-zero value in row[4..7]? */
+ cmpeq v1, a4 /* ... or in row[3:2]? */
+ cmpeq v1, a3, lsr #16 /* ... or in row[1]? */
+ beq row_dc_only /* only row[0] is non-zero: take the DC shortcut */
+
+ mov v1, #(1<<(ROW_SHIFT-1))
+ mov ip, #16384
+ sub ip, ip, #1 /* ip = W4 */
+ smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
+ ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */
+ smultb a2, ip, a4
+ smulbb lr, ip, a4
+ add v2, v1, a2
+ sub v3, v1, a2
+ sub v4, v1, lr
+ add v1, v1, lr
+
+ ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
+ ldr lr, [pc, #(w57-.-8)] /* lr = W5 | (W7 << 16) */
+ smulbt v5, ip, a3
+ smultt v6, lr, a4
+ smlatt v5, ip, a4, v5
+ smultt a2, ip, a3
+ smulbt v7, lr, a3
+ sub v6, v6, a2
+ smulbt a2, ip, a4
+ smultt fp, lr, a3
+ sub v7, v7, a2
+ smulbt a2, lr, a4
+ ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
+ sub fp, fp, a2
+
+ orrs a2, a3, a4
+ beq 1f
+
+ smlabt v5, lr, a3, v5
+ smlabt v6, ip, a3, v6
+ smlatt v5, lr, a4, v5
+ smlabt v6, lr, a4, v6
+ smlatt v7, lr, a3, v7
+ smlatt fp, ip, a3, fp
+ smulbt a2, ip, a4
+ smlatt v7, ip, a4, v7
+ sub fp, fp, a2
+
+ ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */
+ mov a2, #16384
+ sub a2, a2, #1 /* a2 = W4 */
+ smulbb a2, a2, a3 /* a2 = W4*row[4] */
+ smultb lr, ip, a4 /* lr = W6*row[6] */
+ add v1, v1, a2 /* v1 += W4*row[4] */
+ add v1, v1, lr /* v1 += W6*row[6] */
+ add v4, v4, a2 /* v4 += W4*row[4] */
+ sub v4, v4, lr /* v4 -= W6*row[6] */
+ smulbb lr, ip, a4 /* lr = W2*row[6] */
+ sub v2, v2, a2 /* v2 -= W4*row[4] */
+ sub v2, v2, lr /* v2 -= W2*row[6] */
+ sub v3, v3, a2 /* v3 -= W4*row[4] */
+ add v3, v3, lr /* v3 += W2*row[6] */
+
+1: add a2, v1, v5
+ mov a3, a2, lsr #11
+ bic a3, a3, #0x1f0000
+ sub a2, v2, v6
+ mov a2, a2, lsr #11
+ add a3, a3, a2, lsl #16
+ add a2, v3, v7
+ mov a4, a2, lsr #11
+ bic a4, a4, #0x1f0000
+ add a2, v4, fp
+ mov a2, a2, lsr #11
+ add a4, a4, a2, lsl #16
+ strd a3, [a1]
+
+ sub a2, v4, fp
+ mov a3, a2, lsr #11
+ bic a3, a3, #0x1f0000
+ sub a2, v3, v7
+ mov a2, a2, lsr #11
+ add a3, a3, a2, lsl #16
+ add a2, v2, v6
+ mov a4, a2, lsr #11
+ bic a4, a4, #0x1f0000
+ sub a2, v1, v5
+ mov a2, a2, lsr #11
+ add a4, a4, a2, lsl #16
+ strd a3, [a1, #8]
+
+ ldr pc, [sp], #4
+
+row_dc_only:
+ orr a3, a3, a3, lsl #16 /* duplicate the DC value in both halfwords */
+ bic a3, a3, #0xe000 /* clear the bits the shift would carry into the top halfword */
+ mov a3, a3, lsl #3 /* each halfword becomes row[0] << 3, the DC-only row result */
+ mov a4, a3
+ strd a3, [a1]
+ strd a3, [a1, #8]
+
+ ldr pc, [sp], #4
+ .endfunc
+
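+/*
+ * For reference, a compact C model of what idct_row_armv5te computes for one
+ * row (illustrative only, not assembled; the asm additionally has the DC-only
+ * shortcut above).  It follows the a0..a3/b0..b3 decomposition described by
+ * the register comments:
+ *
+ *     #include <stdint.h>
+ *
+ *     static void idct_row_ref(int16_t row[8])
+ *     {
+ *         int a0, a1, a2, a3, b0, b1, b2, b3;
+ *
+ *         a0 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
+ *         a1 = a0 + W6 * row[2];
+ *         a2 = a0 - W6 * row[2];
+ *         a3 = a0 - W2 * row[2];
+ *         a0 = a0 + W2 * row[2];
+ *         b0 = W1 * row[1] + W3 * row[3];
+ *         b1 = W3 * row[1] - W7 * row[3];
+ *         b2 = W5 * row[1] - W1 * row[3];
+ *         b3 = W7 * row[1] - W5 * row[3];
+ *         if (row[4] | row[5] | row[6] | row[7]) {
+ *             a0 += W4 * row[4] + W6 * row[6];
+ *             a1 -= W4 * row[4] + W2 * row[6];
+ *             a2 += W2 * row[6] - W4 * row[4];
+ *             a3 += W4 * row[4] - W6 * row[6];
+ *             b0 += W5 * row[5] + W7 * row[7];
+ *             b1 -= W1 * row[5] + W5 * row[7];
+ *             b2 += W7 * row[5] + W3 * row[7];
+ *             b3 += W3 * row[5] - W1 * row[7];
+ *         }
+ *         row[0] = (a0 + b0) >> ROW_SHIFT;  row[7] = (a0 - b0) >> ROW_SHIFT;
+ *         row[1] = (a1 + b1) >> ROW_SHIFT;  row[6] = (a1 - b1) >> ROW_SHIFT;
+ *         row[2] = (a2 + b2) >> ROW_SHIFT;  row[5] = (a2 - b2) >> ROW_SHIFT;
+ *         row[3] = (a3 + b3) >> ROW_SHIFT;  row[4] = (a3 - b3) >> ROW_SHIFT;
+ *     }
+ */
+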
+ .macro idct_col
+ ldr a4, [a1] /* a4 = col[1:0] */
+ mov ip, #16384
+ sub ip, ip, #1 /* ip = W4 */
+#if 0
+ mov v1, #(1<<(COL_SHIFT-1))
+ smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
+ smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
+ ldr a4, [a1, #(16*4)]
+#else
+ mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
+ add v2, v1, a4, asr #16
+ rsb v2, v2, v2, lsl #14
+ mov a4, a4, lsl #16
+ add v1, v1, a4, asr #16
+ ldr a4, [a1, #(16*4)]
+ rsb v1, v1, v1, lsl #14
+#endif
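+/* Note: W4 = 16383 = (1<<14)-1, so "rsb x, x, x, lsl #14" computes x*W4 as
+   (x<<14)-x.  Pre-adding (1<<(COL_SHIFT-1))/W4 to col[0] before that multiply
+   folds in the rounding bias: W4*(col[0] + bias/W4) ~= W4*col[0] + bias,
+   which is why this branch matches the C version. */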
+
+ smulbb lr, ip, a4
+ smulbt a3, ip, a4
+ sub v3, v1, lr
+ sub v5, v1, lr
+ add v7, v1, lr
+ add v1, v1, lr
+ sub v4, v2, a3
+ sub v6, v2, a3
+ add fp, v2, a3
+ ldr ip, [pc, #(w26-.-8)]
+ ldr a4, [a1, #(16*2)]
+ add v2, v2, a3
+
+ smulbb lr, ip, a4
+ smultb a3, ip, a4
+ add v1, v1, lr
+ sub v7, v7, lr
+ add v3, v3, a3
+ sub v5, v5, a3
+ smulbt lr, ip, a4
+ smultt a3, ip, a4
+ add v2, v2, lr
+ sub fp, fp, lr
+ add v4, v4, a3
+ ldr a4, [a1, #(16*6)]
+ sub v6, v6, a3
+
+ smultb lr, ip, a4
+ smulbb a3, ip, a4
+ add v1, v1, lr
+ sub v7, v7, lr
+ sub v3, v3, a3
+ add v5, v5, a3
+ smultt lr, ip, a4
+ smulbt a3, ip, a4
+ add v2, v2, lr
+ sub fp, fp, lr
+ sub v4, v4, a3
+ add v6, v6, a3
+
+ stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
+
+ ldr ip, [pc, #(w13-.-8)]
+ ldr a4, [a1, #(16*1)]
+ ldr lr, [pc, #(w57-.-8)]
+ smulbb v1, ip, a4
+ smultb v3, ip, a4
+ smulbb v5, lr, a4
+ smultb v7, lr, a4
+ smulbt v2, ip, a4
+ smultt v4, ip, a4
+ smulbt v6, lr, a4
+ smultt fp, lr, a4
+ rsb v4, v4, #0
+ ldr a4, [a1, #(16*3)]
+ rsb v3, v3, #0
+
+ smlatb v1, ip, a4, v1
+ smlatb v3, lr, a4, v3
+ smulbb a3, ip, a4
+ smulbb a2, lr, a4
+ sub v5, v5, a3
+ sub v7, v7, a2
+ smlatt v2, ip, a4, v2
+ smlatt v4, lr, a4, v4
+ smulbt a3, ip, a4
+ smulbt a2, lr, a4
+ sub v6, v6, a3
+ ldr a4, [a1, #(16*5)]
+ sub fp, fp, a2
+
+ smlabb v1, lr, a4, v1
+ smlabb v3, ip, a4, v3
+ smlatb v5, lr, a4, v5
+ smlatb v7, ip, a4, v7
+ smlabt v2, lr, a4, v2
+ smlabt v4, ip, a4, v4
+ smlatt v6, lr, a4, v6
+ ldr a3, [a1, #(16*7)]
+ smlatt fp, ip, a4, fp
+
+ smlatb v1, lr, a3, v1
+ smlabb v3, lr, a3, v3
+ smlatb v5, ip, a3, v5
+ smulbb a4, ip, a3
+ smlatt v2, lr, a3, v2
+ sub v7, v7, a4
+ smlabt v4, lr, a3, v4
+ smulbt a4, ip, a3
+ smlatt v6, ip, a3, v6
+ sub fp, fp, a4
+ .endm
+
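+/*
+ * idct_col_armv5te packs two 16-bit results per word.  Each "mov rX, rY, lsr #20"
+ * plus "orrmi rX, rX, #0xf000" pair below is a 16-bit arithmetic shift in
+ * disguise: the logical shift keeps the upper halfword clear for packing, and
+ * for negative values the missing sign bits are restored in bits 12..15 only,
+ * i.e. roughly (uint16_t)(x >> COL_SHIFT) in C.
+ */
+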
+function idct_col_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldmfd sp!, {a3, a4}
+ adds a2, a3, v1
+ mov a2, a2, lsr #20
+ orrmi a2, a2, #0xf000
+ add ip, a4, v2
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1]
+ subs a3, a3, v1
+ mov a2, a3, lsr #20
+ orrmi a2, a2, #0xf000
+ sub a4, a4, v2
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*7)]
+
+ subs a2, a3, v3
+ mov a2, a2, lsr #20
+ orrmi a2, a2, #0xf000
+ sub ip, a4, v4
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*1)]
+ adds a3, a3, v3
+ mov a2, a3, lsr #20
+ orrmi a2, a2, #0xf000
+ add a4, a4, v4
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*6)]
+
+ adds a2, a3, v5
+ mov a2, a2, lsr #20
+ orrmi a2, a2, #0xf000
+ add ip, a4, v6
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*2)]
+ subs a3, a3, v5
+ mov a2, a3, lsr #20
+ orrmi a2, a2, #0xf000
+ sub a4, a4, v6
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*5)]
+
+ adds a2, a3, v7
+ mov a2, a2, lsr #20
+ orrmi a2, a2, #0xf000
+ add ip, a4, fp
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*3)]
+ subs a3, a3, v7
+ mov a2, a3, lsr #20
+ orrmi a2, a2, #0xf000
+ sub a4, a4, fp
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ str a2, [a1, #(16*4)]
+
+ ldr pc, [sp], #4
+ .endfunc
+
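+/*
+ * In the put/add variants below, each "movs/adds ... asr #20", "movmi rX, #0",
+ * "cmp rX, #255", "movgt rX, #255" sequence clamps one result to the unsigned
+ * 8-bit range, i.e. in C: y = x >> COL_SHIFT; if (y < 0) y = 0; else if (y > 255) y = 255;
+ */
+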
+function idct_col_put_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldmfd sp!, {a3, a4}
+ ldr lr, [sp, #32]
+ add a2, a3, v1
+ movs a2, a2, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add ip, a4, v2
+ movs ip, ip, asr #20
+ movmi ip, #0
+ cmp ip, #255
+ movgt ip, #255
+ orr a2, a2, ip, lsl #8
+ sub a3, a3, v1
+ movs a3, a3, asr #20
+ movmi a3, #0
+ cmp a3, #255
+ movgt a3, #255
+ sub a4, a4, v2
+ movs a4, a4, asr #20
+ movmi a4, #0
+ cmp a4, #255
+ ldr v1, [sp, #28]
+ movgt a4, #255
+ strh a2, [v1]
+ add a2, v1, #2
+ str a2, [sp, #28]
+ orr a2, a3, a4, lsl #8
+ rsb v2, lr, lr, lsl #3
+ ldmfd sp!, {a3, a4}
+ strh a2, [v2, v1]!
+
+ sub a2, a3, v3
+ movs a2, a2, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ sub ip, a4, v4
+ movs ip, ip, asr #20
+ movmi ip, #0
+ cmp ip, #255
+ movgt ip, #255
+ orr a2, a2, ip, lsl #8
+ strh a2, [v1, lr]!
+ add a3, a3, v3
+ movs a2, a3, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add a4, a4, v4
+ movs a4, a4, asr #20
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a2, a4, lsl #8
+ ldmfd sp!, {a3, a4}
+ strh a2, [v2, -lr]!
+
+ add a2, a3, v5
+ movs a2, a2, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add ip, a4, v6
+ movs ip, ip, asr #20
+ movmi ip, #0
+ cmp ip, #255
+ movgt ip, #255
+ orr a2, a2, ip, lsl #8
+ strh a2, [v1, lr]!
+ sub a3, a3, v5
+ movs a2, a3, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ sub a4, a4, v6
+ movs a4, a4, asr #20
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a2, a4, lsl #8
+ ldmfd sp!, {a3, a4}
+ strh a2, [v2, -lr]!
+
+ add a2, a3, v7
+ movs a2, a2, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add ip, a4, fp
+ movs ip, ip, asr #20
+ movmi ip, #0
+ cmp ip, #255
+ movgt ip, #255
+ orr a2, a2, ip, lsl #8
+ strh a2, [v1, lr]
+ sub a3, a3, v7
+ movs a2, a3, asr #20
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ sub a4, a4, fp
+ movs a4, a4, asr #20
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a2, a4, lsl #8
+ strh a2, [v2, -lr]
+
+ ldr pc, [sp], #4
+ .endfunc
+
+function idct_col_add_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldr lr, [sp, #36]
+
+ ldmfd sp!, {a3, a4}
+ ldrh ip, [lr]
+ add a2, a3, v1
+ mov a2, a2, asr #20
+ sub a3, a3, v1
+ and v1, ip, #255
+ adds a2, a2, v1
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add v1, a4, v2
+ mov v1, v1, asr #20
+ adds v1, v1, ip, lsr #8
+ movmi v1, #0
+ cmp v1, #255
+ movgt v1, #255
+ orr a2, a2, v1, lsl #8
+ ldr v1, [sp, #32]
+ sub a4, a4, v2
+ rsb v2, v1, v1, lsl #3
+ ldrh ip, [v2, lr]!
+ strh a2, [lr]
+ mov a3, a3, asr #20
+ and a2, ip, #255
+ adds a3, a3, a2
+ movmi a3, #0
+ cmp a3, #255
+ movgt a3, #255
+ mov a4, a4, asr #20
+ adds a4, a4, ip, lsr #8
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ add a2, lr, #2
+ str a2, [sp, #28]
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh ip, [lr, v1]!
+ sub a2, a3, v3
+ mov a2, a2, asr #20
+ add a3, a3, v3
+ and v3, ip, #255
+ adds a2, a2, v3
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ sub v3, a4, v4
+ mov v3, v3, asr #20
+ adds v3, v3, ip, lsr #8
+ movmi v3, #0
+ cmp v3, #255
+ movgt v3, #255
+ orr a2, a2, v3, lsl #8
+ add a4, a4, v4
+ ldrh ip, [v2, -v1]!
+ strh a2, [lr]
+ mov a3, a3, asr #20
+ and a2, ip, #255
+ adds a3, a3, a2
+ movmi a3, #0
+ cmp a3, #255
+ movgt a3, #255
+ mov a4, a4, asr #20
+ adds a4, a4, ip, lsr #8
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh ip, [lr, v1]!
+ add a2, a3, v5
+ mov a2, a2, asr #20
+ sub a3, a3, v5
+ and v3, ip, #255
+ adds a2, a2, v3
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add v3, a4, v6
+ mov v3, v3, asr #20
+ adds v3, v3, ip, lsr #8
+ movmi v3, #0
+ cmp v3, #255
+ movgt v3, #255
+ orr a2, a2, v3, lsl #8
+ sub a4, a4, v6
+ ldrh ip, [v2, -v1]!
+ strh a2, [lr]
+ mov a3, a3, asr #20
+ and a2, ip, #255
+ adds a3, a3, a2
+ movmi a3, #0
+ cmp a3, #255
+ movgt a3, #255
+ mov a4, a4, asr #20
+ adds a4, a4, ip, lsr #8
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh ip, [lr, v1]!
+ add a2, a3, v7
+ mov a2, a2, asr #20
+ sub a3, a3, v7
+ and v3, ip, #255
+ adds a2, a2, v3
+ movmi a2, #0
+ cmp a2, #255
+ movgt a2, #255
+ add v3, a4, fp
+ mov v3, v3, asr #20
+ adds v3, v3, ip, lsr #8
+ movmi v3, #0
+ cmp v3, #255
+ movgt v3, #255
+ orr a2, a2, v3, lsl #8
+ sub a4, a4, fp
+ ldrh ip, [v2, -v1]!
+ strh a2, [lr]
+ mov a3, a3, asr #20
+ and a2, ip, #255
+ adds a3, a3, a2
+ movmi a3, #0
+ cmp a3, #255
+ movgt a3, #255
+ mov a4, a4, asr #20
+ adds a4, a4, ip, lsr #8
+ movmi a4, #0
+ cmp a4, #255
+ movgt a4, #255
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldr pc, [sp], #4
+ .endfunc
+
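+/*
+ * Driver functions: idct_row_armv5te handles one 16-byte row per call, and
+ * each idct_col_* call handles two adjacent columns (the packed halfword
+ * pairs), so four column calls stepping the pointer by 4 bytes cover the
+ * whole block.
+ */
+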
+function simple_idct_armv5te, export=1
+ stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
+
+function simple_idct_add_armv5te, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ mov a1, a3
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+
+ add sp, sp, #8
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
+
+function simple_idct_put_armv5te, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ mov a1, a3
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+
+ add sp, sp, #8
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
diff --git a/libavcodec/arm/simple_idct_armv6.S b/libavcodec/arm/simple_idct_armv6.S
new file mode 100644
index 0000000000..4f3330d223
--- /dev/null
+++ b/libavcodec/arm/simple_idct_armv6.S
@@ -0,0 +1,433 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define W13 (W1 | (W3 << 16))
+#define W26 (W2 | (W6 << 16))
+#define W42 (W4 | (W2 << 16))
+#define W42n ((-W4&0xffff) | (-W2 << 16))
+#define W46 (W4 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+
+ .text
+ .align
+w13: .long W13
+w26: .long W26
+w42: .long W42
+w42n: .long W42n
+w46: .long W46
+w57: .long W57
+
+/*
+ Compute partial IDCT of single row.
+ shift = left-shift amount
+ a1 = source address
+ a3 = row[2,0] <= 2 cycles
+ a4 = row[3,1]
+ ip = w42 <= 2 cycles
+
+ Output in registers v1--v8
+*/
+ .macro idct_row shift
+ ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
+ mov a2, #(1<<(\shift-1))
+ smlad v1, a3, ip, a2
+ smlsd v4, a3, ip, a2
+ ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
+ ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
+ smlad v2, a3, lr, a2
+ smlsd v3, a3, lr, a2
+
+ smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
+ smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
+ ldr lr, [a1, #12] /* lr = row[7,5] */
+ pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */
+ pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */
+ smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
+ smlad v5, lr, v7, v5 /* B0 += W5*row[5] + W7*row[7] */
+ smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
+
+ ldr a4, [pc, #(w42n-.-8)] /* a4 = -W4 | (-W2 << 16) */
+ smlad v7, lr, a3, v7 /* B2 += W7*row[5] + W3*row[7] */
+ ldr a3, [a1, #4] /* a3 = row[6,4] */
+ smlsdx fp, lr, ip, fp /* B3 += W3*row[5] - W1*row[7] */
+ ldr ip, [pc, #(w46-.-8)] /* ip = W4 | (W6 << 16) */
+ smlad v6, lr, a2, v6 /* B1 -= W1*row[5] + W5*row[7] */
+
+ smlad v2, a3, a4, v2 /* A1 += -W4*row[4] - W2*row[6] */
+ smlsd v3, a3, a4, v3 /* A2 += -W4*row[4] + W2*row[6] */
+ smlad v1, a3, ip, v1 /* A0 += W4*row[4] + W6*row[6] */
+ smlsd v4, a3, ip, v4 /* A3 += W4*row[4] - W6*row[6] */
+ .endm
+
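+/*
+ * The row macros above lean on the ARMv6 dual-16-bit multiplies.  A rough C
+ * model of the ones used here (illustrative only, not assembled; the
+ * x-suffixed forms swap the halfwords of the second operand, and smlsdx
+ * likewise adds an accumulator to smusdx):
+ *
+ *     #include <stdint.h>
+ *
+ *     static inline int lo16(uint32_t x) { return (int16_t) x;        }
+ *     static inline int hi16(uint32_t x) { return (int16_t)(x >> 16); }
+ *     static inline int smuad (uint32_t x, uint32_t y) { return lo16(x)*lo16(y) + hi16(x)*hi16(y); }
+ *     static inline int smusd (uint32_t x, uint32_t y) { return lo16(x)*lo16(y) - hi16(x)*hi16(y); }
+ *     static inline int smusdx(uint32_t x, uint32_t y) { return lo16(x)*hi16(y) - hi16(x)*lo16(y); }
+ *     static inline int smlad (uint32_t x, uint32_t y, int a) { return a + smuad(x, y); }
+ *     static inline int smlsd (uint32_t x, uint32_t y, int a) { return a + smusd(x, y); }
+ *
+ * With a3 = row[2,0] and ip = W4|(W2<<16), "smlad v1, a3, ip, a2" is therefore
+ * bias + W4*row[0] + W2*row[2] (the A0 partial sum) and "smlsd v4, a3, ip, a2"
+ * is the matching A3 partial sum.
+ */
+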
+/*
+ Compute partial IDCT of half row.
+ shift = left-shift amount
+ a3 = row[2,0]
+ a4 = row[3,1]
+ ip = w42
+
+ Output in registers v1--v8
+*/
+ .macro idct_row4 shift
+ ldr lr, [pc, #(w46-.-8)] /* lr = W4 | (W6 << 16) */
+ ldr v7, [pc, #(w57-.-8)] /* v7 = W5 | (W7 << 16) */
+ mov a2, #(1<<(\shift-1))
+ smlad v1, a3, ip, a2
+ smlsd v4, a3, ip, a2
+ ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */
+ smlad v2, a3, lr, a2
+ smlsd v3, a3, lr, a2
+ smusdx fp, a4, v7 /* fp = B3 = W7*row[1] - W5*row[3] */
+ smuad v5, a4, ip /* v5 = B0 = W1*row[1] + W3*row[3] */
+ pkhtb a3, ip, v7, asr #16 /* a3 = W7 | (W3 << 16) */
+ pkhbt a2, ip, v7, lsl #16 /* a2 = W1 | (W5 << 16) */
+ smusdx v6, a3, a4 /* v6 = -B1 = W7*row[3] - W3*row[1] */
+ smusdx v7, a4, a2 /* v7 = B2 = W5*row[1] - W1*row[3] */
+ .endm
+
+/*
+ Compute final part of IDCT single row without shift.
+ Input in registers v1--v8
+ Output in registers ip, v1--v3, lr, v5--v7
+*/
+ .macro idct_finish
+ add ip, v1, v5 /* a2 = A0 + B0 */
+ sub lr, v1, v5 /* a3 = A0 - B0 */
+ sub v1, v2, v6 /* a3 = A1 + B1 */
+ add v5, v2, v6 /* a3 = A1 - B1 */
+ add v2, v3, v7 /* a2 = A2 + B2 */
+ sub v6, v3, v7 /* a2 = A2 - B2 */
+ add v3, v4, fp /* a3 = A3 + B3 */
+ sub v7, v4, fp /* a3 = A3 - B3 */
+ .endm
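+
+/* Note: v6 holds -B1 (see the smusdx comments in idct_row/idct_row4), so the
+   "sub v1, v2, v6" above really is A1 + B1 and the "add v5, v2, v6" is
+   A1 - B1; the same sign flip applies in the shift variants below. */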
+
+/*
+ Compute final part of IDCT single row.
+ shift = right-shift amount
+ Input/output in registers v1--v8
+*/
+ .macro idct_finish_shift shift
+ add a4, v1, v5 /* a4 = A0 + B0 */
+ sub a3, v1, v5 /* a3 = A0 - B0 */
+ mov v1, a4, asr #\shift
+ mov v5, a3, asr #\shift
+
+ sub a4, v2, v6 /* a4 = A1 + B1 */
+ add a3, v2, v6 /* a3 = A1 - B1 */
+ mov v2, a4, asr #\shift
+ mov v6, a3, asr #\shift
+
+ add a4, v3, v7 /* a4 = A2 + B2 */
+ sub a3, v3, v7 /* a3 = A2 - B2 */
+ mov v3, a4, asr #\shift
+ mov v7, a3, asr #\shift
+
+ add a4, v4, fp /* a4 = A3 + B3 */
+ sub a3, v4, fp /* a3 = A3 - B3 */
+ mov v4, a4, asr #\shift
+ mov fp, a3, asr #\shift
+ .endm
+
+/*
+ Compute final part of IDCT single row, saturating results at 8 bits.
+ shift = right-shift amount
+ Input/output in registers v1--v8
+*/
+ .macro idct_finish_shift_sat shift
+ add a4, v1, v5 /* a4 = A0 + B0 */
+ sub ip, v1, v5 /* ip = A0 - B0 */
+ usat v1, #8, a4, asr #\shift
+ usat v5, #8, ip, asr #\shift
+
+ sub a4, v2, v6 /* a4 = A1 + B1 */
+ add ip, v2, v6 /* ip = A1 - B1 */
+ usat v2, #8, a4, asr #\shift
+ usat v6, #8, ip, asr #\shift
+
+ add a4, v3, v7 /* a4 = A2 + B2 */
+ sub ip, v3, v7 /* ip = A2 - B2 */
+ usat v3, #8, a4, asr #\shift
+ usat v7, #8, ip, asr #\shift
+
+ add a4, v4, fp /* a4 = A3 + B3 */
+ sub ip, v4, fp /* ip = A3 - B3 */
+ usat v4, #8, a4, asr #\shift
+ usat fp, #8, ip, asr #\shift
+ .endm
+
+/*
+ Compute IDCT of single row, storing as column.
+ a1 = source
+ a2 = dest
+*/
+function idct_row_armv6
+ str lr, [sp, #-4]!
+
+ ldr lr, [a1, #12] /* lr = row[7,5] */
+ ldr ip, [a1, #4] /* ip = row[6,4] */
+ ldr a4, [a1, #8] /* a4 = row[3,1] */
+ ldr a3, [a1] /* a3 = row[2,0] */
+ orrs lr, lr, ip
+ cmpeq lr, a4
+ cmpeq lr, a3, lsr #16
+ beq 1f
+ str a2, [sp, #-4]!
+ ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
+ cmp lr, #0
+ beq 2f
+
+ idct_row ROW_SHIFT
+ b 3f
+
+2: idct_row4 ROW_SHIFT
+
+3: ldr a2, [sp], #4
+ idct_finish_shift ROW_SHIFT
+
+ strh v1, [a2]
+ strh v2, [a2, #(16*2)]
+ strh v3, [a2, #(16*4)]
+ strh v4, [a2, #(16*6)]
+ strh fp, [a2, #(16*1)]
+ strh v7, [a2, #(16*3)]
+ strh v6, [a2, #(16*5)]
+ strh v5, [a2, #(16*7)]
+
+ ldr pc, [sp], #4
+
+1: mov a3, a3, lsl #3
+ strh a3, [a2]
+ strh a3, [a2, #(16*2)]
+ strh a3, [a2, #(16*4)]
+ strh a3, [a2, #(16*6)]
+ strh a3, [a2, #(16*1)]
+ strh a3, [a2, #(16*3)]
+ strh a3, [a2, #(16*5)]
+ strh a3, [a2, #(16*7)]
+ ldr pc, [sp], #4
+ .endfunc
+
+/*
+ Compute IDCT of single column, read as row.
+ a1 = source
+ a2 = dest
+*/
+function idct_col_armv6
+ stmfd sp!, {a2, lr}
+
+ ldr a3, [a1] /* a3 = row[2,0] */
+ ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
+ ldr a4, [a1, #8] /* a4 = row[3,1] */
+ idct_row COL_SHIFT
+ ldr a2, [sp], #4
+ idct_finish_shift COL_SHIFT
+
+ strh v1, [a2]
+ strh v2, [a2, #(16*1)]
+ strh v3, [a2, #(16*2)]
+ strh v4, [a2, #(16*3)]
+ strh fp, [a2, #(16*4)]
+ strh v7, [a2, #(16*5)]
+ strh v6, [a2, #(16*6)]
+ strh v5, [a2, #(16*7)]
+
+ ldr pc, [sp], #4
+ .endfunc
+
+/*
+ Compute IDCT of single column, read as row, store saturated 8-bit.
+ a1 = source
+ a2 = dest
+ a3 = line size
+*/
+function idct_col_put_armv6
+ stmfd sp!, {a2, a3, lr}
+
+ ldr a3, [a1] /* a3 = row[2,0] */
+ ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
+ ldr a4, [a1, #8] /* a4 = row[3,1] */
+ idct_row COL_SHIFT
+ ldmfd sp!, {a2, a3}
+ idct_finish_shift_sat COL_SHIFT
+
+ strb v1, [a2], a3
+ strb v2, [a2], a3
+ strb v3, [a2], a3
+ strb v4, [a2], a3
+ strb fp, [a2], a3
+ strb v7, [a2], a3
+ strb v6, [a2], a3
+ strb v5, [a2], a3
+
+ sub a2, a2, a3, lsl #3
+
+ ldr pc, [sp], #4
+ .endfunc
+
+/*
+ Compute IDCT of single column, read as row, add/store saturated 8-bit.
+ a1 = source
+ a2 = dest
+ a3 = line size
+*/
+function idct_col_add_armv6
+ stmfd sp!, {a2, a3, lr}
+
+ ldr a3, [a1] /* a3 = row[2,0] */
+ ldr ip, [pc, #(w42-.-8)] /* ip = W4 | (W2 << 16) */
+ ldr a4, [a1, #8] /* a4 = row[3,1] */
+ idct_row COL_SHIFT
+ ldmfd sp!, {a2, a3}
+ idct_finish
+
+ ldrb a4, [a2]
+ ldrb v4, [a2, a3]
+ ldrb fp, [a2, a3, lsl #2]
+ add ip, a4, ip, asr #COL_SHIFT
+ usat ip, #8, ip
+ add v1, v4, v1, asr #COL_SHIFT
+ strb ip, [a2], a3
+ ldrb ip, [a2, a3]
+ usat v1, #8, v1
+ ldrb fp, [a2, a3, lsl #2]
+ add v2, ip, v2, asr #COL_SHIFT
+ usat v2, #8, v2
+ strb v1, [a2], a3
+ ldrb a4, [a2, a3]
+ ldrb ip, [a2, a3, lsl #2]
+ strb v2, [a2], a3
+ ldrb v4, [a2, a3]
+ ldrb v1, [a2, a3, lsl #2]
+ add v3, a4, v3, asr #COL_SHIFT
+ usat v3, #8, v3
+ add v7, v4, v7, asr #COL_SHIFT
+ usat v7, #8, v7
+ add v6, fp, v6, asr #COL_SHIFT
+ usat v6, #8, v6
+ add v5, ip, v5, asr #COL_SHIFT
+ usat v5, #8, v5
+ add lr, v1, lr, asr #COL_SHIFT
+ usat lr, #8, lr
+ strb v3, [a2], a3
+ strb v7, [a2], a3
+ strb v6, [a2], a3
+ strb v5, [a2], a3
+ strb lr, [a2], a3
+
+ sub a2, a2, a3, lsl #3
+
+ ldr pc, [sp], #4
+ .endfunc
+
+/*
+ Compute 8 IDCT row transforms.
+ func = IDCT row->col function
+ width = width of columns in bytes
+*/
+ .macro idct_rows func width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+ sub a1, a1, #(16*5)
+ add a2, a2, #\width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+ add a1, a1, #(16*2)
+ add a2, a2, #\width
+ bl \func
+
+ sub a1, a1, #(16*7)
+ .endm
+
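+/*
+ * Equivalent pointer walk in C (illustrative; a1/a2 are the macro's source and
+ * destination): rows are visited in the order 0,2,4,6,1,3,5,7 while the
+ * destination advances linearly, the same 0,2,4,6,1,3,5,7 pattern that
+ * idct_row_armv6 uses for its 16-byte-stride strh offsets:
+ *
+ *     static const int order[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
+ *     int i;
+ *     for (i = 0; i < 8; i++)
+ *         func((char *)src + 16 * order[i], (char *)dst + i * width);  // byte offsets
+ */
+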
+/* void ff_simple_idct_armv6(DCTELEM *data); */
+function ff_simple_idct_armv6, export=1
+ stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
+ sub sp, sp, #128
+
+ mov a2, sp
+ idct_rows idct_row_armv6, 2
+ mov a2, a1
+ mov a1, sp
+ idct_rows idct_col_armv6, 2
+
+ add sp, sp, #128
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
+
+/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
+function ff_simple_idct_add_armv6, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+ sub sp, sp, #128
+
+ mov a1, a3
+ mov a2, sp
+ idct_rows idct_row_armv6, 2
+ mov a1, sp
+ ldr a2, [sp, #128]
+ ldr a3, [sp, #(128+4)]
+ idct_rows idct_col_add_armv6, 1
+
+ add sp, sp, #(128+8)
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
+
+/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); */
+function ff_simple_idct_put_armv6, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+ sub sp, sp, #128
+
+ mov a1, a3
+ mov a2, sp
+ idct_rows idct_row_armv6, 2
+ mov a1, sp
+ ldr a2, [sp, #128]
+ ldr a3, [sp, #(128+4)]
+ idct_rows idct_col_put_armv6, 1
+
+ add sp, sp, #(128+8)
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+ .endfunc
diff --git a/libavcodec/arm/simple_idct_neon.S b/libavcodec/arm/simple_idct_neon.S
new file mode 100644
index 0000000000..376db0b534
--- /dev/null
+++ b/libavcodec/arm/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4c ((1<<(COL_SHIFT-1))/W4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define w1 d0[0]
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
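+
+/* w1..w4c are lane aliases into d0/d1; the idct_start macro near the end of
+   this file fills d0/d1 from the "const" table before any IDCT work starts. */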
+
+ .fpu neon
+
+ .macro idct_col4_top
+ vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
+ vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
+ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
+ vadd.i32 q11, q15, q7
+ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
+ vadd.i32 q12, q15, q8
+ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
+ vsub.i32 q13, q15, q8
+ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
+ vsub.i32 q14, q15, q7
+
+ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
+ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
+ vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
+ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
+ .endm
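+
+/* After idct_col4_top, across the four 16-bit lanes processed in parallel:
+   q11..q14 hold the even-part accumulators a0..a3 (the biased W4*col[0] term
+   from q15 combined with +/-W2*col[2] and +/-W6*col[2]), while q9, q10, q5, q6
+   hold the odd-part sums b0..b3 built from col[1] and col[3].  The callers add
+   the col[4..7] contributions afterwards when those coefficients are non-zero. */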
+
+ .text
+ .align 6
+
+function idct_row4_neon
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1))
+ vld1.64 {d2-d5}, [r2,:128]!
+ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
+ vld1.64 {d6,d7}, [r2,:128]!
+ vorr d10, d3, d5
+ vld1.64 {d8,d9}, [r2,:128]!
+ add r2, r2, #-64
+
+ vorr d11, d7, d9
+ vorr d10, d10, d11
+ vmov r3, r4, d10
+
+ idct_col4_top
+
+ orrs r3, r3, r4
+ beq 1f
+
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+ vsub.i32 q14, q14, q7
+
+1: vadd.i32 q3, q11, q9
+ vadd.i32 q4, q12, q10
+ vshrn.i32 d2, q3, #ROW_SHIFT
+ vshrn.i32 d4, q4, #ROW_SHIFT
+ vadd.i32 q7, q13, q5
+ vadd.i32 q8, q14, q6
+ vtrn.16 d2, d4
+ vshrn.i32 d6, q7, #ROW_SHIFT
+ vshrn.i32 d8, q8, #ROW_SHIFT
+ vsub.i32 q14, q14, q6
+ vsub.i32 q11, q11, q9
+ vtrn.16 d6, d8
+ vsub.i32 q13, q13, q5
+ vshrn.i32 d3, q14, #ROW_SHIFT
+ vtrn.32 d2, d6
+ vsub.i32 q12, q12, q10
+ vtrn.32 d4, d8
+ vshrn.i32 d5, q13, #ROW_SHIFT
+ vshrn.i32 d7, q12, #ROW_SHIFT
+ vshrn.i32 d9, q11, #ROW_SHIFT
+
+ vtrn.16 d3, d5
+ vtrn.16 d7, d9
+ vtrn.32 d3, d7
+ vtrn.32 d5, d9
+
+ vst1.64 {d2-d5}, [r2,:128]!
+ vst1.64 {d6-d9}, [r2,:128]!
+
+ bx lr
+ .endfunc
+
+function idct_col4_neon
+ mov ip, #16
+ vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
+ vdup.16 d30, w4c
+ vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */
+ vadd.i16 d30, d30, d2
+ vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */
+ vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<(COL_SHIFT-1))/W4) */
+ vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */
+
+ ldrd r4, [r2]
+ ldrd r6, [r2, #16]
+ orrs r4, r4, r5
+
+ idct_col4_top
+ addeq r2, r2, #16
+ beq 1f
+
+ vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+
+1: orrs r6, r6, r7
+ ldrd r4, [r2, #16]
+ addeq r2, r2, #16
+ beq 2f
+
+ vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+
+2: orrs r4, r4, r5
+ ldrd r4, [r2, #16]
+ addeq r2, r2, #16
+ beq 3f
+
+ vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q14, q14, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+
+3: orrs r4, r4, r5
+ addeq r2, r2, #16
+ beq 4f
+
+ vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+
+4: vaddhn.i32 d2, q11, q9
+ vaddhn.i32 d3, q12, q10
+ vaddhn.i32 d4, q13, q5
+ vaddhn.i32 d5, q14, q6
+ vsubhn.i32 d9, q11, q9
+ vsubhn.i32 d8, q12, q10
+ vsubhn.i32 d7, q13, q5
+ vsubhn.i32 d6, q14, q6
+
+ bx lr
+ .endfunc
+
+ .align 6
+
+function idct_col4_st8_neon
+ vqshrun.s16 d2, q1, #COL_SHIFT-16
+ vqshrun.s16 d3, q2, #COL_SHIFT-16
+ vqshrun.s16 d4, q3, #COL_SHIFT-16
+ vqshrun.s16 d5, q4, #COL_SHIFT-16
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+
+ bx lr
+ .endfunc
+
+ .section .rodata
+ .align 4
+const: .short W1, W2, W3, W4, W5, W6, W7, W4c
+ .previous
+
+ .macro idct_start data
+ push {r4-r7, lr}
+ pld [\data]
+ pld [\data, #64]
+ vpush {d8-d15}
+ movw r3, #:lower16:const
+ movt r3, #:upper16:const
+ vld1.64 {d0,d1}, [r3,:128]
+ .endm
+
+ .macro idct_end
+ vpop {d8-d15}
+ pop {r4-r7, pc}
+ .endm
+
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_put_neon, export=1
+ idct_start r2
+
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+
+ idct_end
+ .endfunc
+
+ .align 6
+
+function idct_col4_add8_neon
+ mov ip, r0
+
+ vld1.32 {d10[0]}, [r0,:32], r1
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vld1.32 {d10[1]}, [r0,:32], r1
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vld1.32 {d11[0]}, [r0,:32], r1
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vld1.32 {d11[1]}, [r0,:32], r1
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vld1.32 {d12[0]}, [r0,:32], r1
+ vaddw.u8 q1, q1, d10
+ vld1.32 {d12[1]}, [r0,:32], r1
+ vaddw.u8 q2, q2, d11
+ vld1.32 {d13[0]}, [r0,:32], r1
+ vqmovun.s16 d2, q1
+ vld1.32 {d13[1]}, [r0,:32], r1
+ vaddw.u8 q3, q3, d12
+ vst1.32 {d2[0]}, [ip,:32], r1
+ vqmovun.s16 d3, q2
+ vst1.32 {d2[1]}, [ip,:32], r1
+ vaddw.u8 q4, q4, d13
+ vst1.32 {d3[0]}, [ip,:32], r1
+ vqmovun.s16 d4, q3
+ vst1.32 {d3[1]}, [ip,:32], r1
+ vqmovun.s16 d5, q4
+ vst1.32 {d4[0]}, [ip,:32], r1
+ vst1.32 {d4[1]}, [ip,:32], r1
+ vst1.32 {d5[0]}, [ip,:32], r1
+ vst1.32 {d5[1]}, [ip,:32], r1
+
+ bx lr
+ .endfunc
+
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+function ff_simple_idct_add_neon, export=1
+ idct_start r2
+
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+
+ idct_end
+ .endfunc
+
+ .align 6
+
+function idct_col4_st16_neon
+ mov ip, #16
+
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vst1.64 {d2}, [r2,:64], ip
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vst1.64 {d3}, [r2,:64], ip
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vst1.64 {d4}, [r2,:64], ip
+ vst1.64 {d5}, [r2,:64], ip
+ vst1.64 {d6}, [r2,:64], ip
+ vst1.64 {d7}, [r2,:64], ip
+ vst1.64 {d8}, [r2,:64], ip
+ vst1.64 {d9}, [r2,:64], ip
+
+ bx lr
+ .endfunc
+
+/* void ff_simple_idct_neon(DCTELEM *data); */
+function ff_simple_idct_neon, export=1
+ idct_start r0
+
+ mov r2, r0
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+ add r2, r2, #-120
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+
+ idct_end
+ .endfunc