summaryrefslogtreecommitdiff
path: root/libavcodec/armv4l/dsputil_iwmmxt.c
diff options
context:
space:
mode:
authorBernhard Rosenkränzer <bero@arklinux.org>2005-05-26 14:32:46 +0000
committerMichael Niedermayer <michaelni@gmx.at>2005-05-26 14:32:46 +0000
commit6ad1fa5a49320c101a62d24aa0e7df14c10d7612 (patch)
tree0bb61f2a5ea730c7c70e9c96f17c47eb6163dd08 /libavcodec/armv4l/dsputil_iwmmxt.c
parentc66a443401bbf38a3c79070d499fcb5601c95cea (diff)
Better ARM support for mplayer/ffmpeg, ported from atty fork
while playing with some new hardware, I found it's running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer Unfortunately it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us w/ ARM CPUs). The patches I've attached are the patches from atty.jp with a couple of modifications by myself: - ported to current CVS - reverted their change of removing SNOW support from ffmpeg - cleaned up their bswap mess - removed DOS-style linebreaks from various files patch by (Bernhard Rosenkraenzer: bero, arklinux org) Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/armv4l/dsputil_iwmmxt.c')
-rw-r--r--libavcodec/armv4l/dsputil_iwmmxt.c168
1 files changed, 168 insertions, 0 deletions
diff --git a/libavcodec/armv4l/dsputil_iwmmxt.c b/libavcodec/armv4l/dsputil_iwmmxt.c
new file mode 100644
index 0000000000..6e2465d7c8
--- /dev/null
+++ b/libavcodec/armv4l/dsputil_iwmmxt.c
@@ -0,0 +1,168 @@
+/*
+ * iWMMXt optimized DSP utils
+ * Copyright (c) 2004 AGAWA Koji
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "../dsputil.h"
+
+#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2b"
+#include "dsputil_iwmmxt_rnd.h"
+#undef DEF
+#undef SET_RND
+#undef WAVG2B
+
+#define DEF(x, y) x ## _ ## y ##_iwmmxt
+#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
+#define WAVG2B "wavg2br"
+#include "dsputil_iwmmxt_rnd.h"
+#undef DEF
+#undef SET_RND
+#undef WAVG2BR
+
+// need scheduling
+#define OP(AVG) \
+ asm volatile ( \
+ /* alignment */ \
+ "and r12, %[pixels], #7 \n\t" \
+ "bic %[pixels], %[pixels], #7 \n\t" \
+ "tmcr wcgr1, r12 \n\t" \
+ \
+ "wldrd wr0, [%[pixels]] \n\t" \
+ "wldrd wr1, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "walignr1 wr4, wr0, wr1 \n\t" \
+ \
+ "1: \n\t" \
+ \
+ "wldrd wr2, [%[pixels]] \n\t" \
+ "wldrd wr3, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "pld [%[pixels]] \n\t" \
+ "walignr1 wr5, wr2, wr3 \n\t" \
+ AVG " wr6, wr4, wr5 \n\t" \
+ "wstrd wr6, [%[block]] \n\t" \
+ "add %[block], %[block], %[line_size] \n\t" \
+ \
+ "wldrd wr0, [%[pixels]] \n\t" \
+ "wldrd wr1, [%[pixels], #8] \n\t" \
+ "add %[pixels], %[pixels], %[line_size] \n\t" \
+ "walignr1 wr4, wr0, wr1 \n\t" \
+ "pld [%[pixels]] \n\t" \
+ AVG " wr6, wr4, wr5 \n\t" \
+ "wstrd wr6, [%[block]] \n\t" \
+ "add %[block], %[block], %[line_size] \n\t" \
+ \
+ "subs %[h], %[h], #2 \n\t" \
+ "bne 1b \n\t" \
+ : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
+ : [line_size]"r"(line_size) \
+ : "memory", "r12");
+void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ OP("wavg2br");
+}
+void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
+{
+ OP("wavg2b");
+}
+#undef OP
+
+void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
+{
+ uint8_t *pixels2 = pixels + line_size;
+
+ __asm__ __volatile__ (
+ "mov r12, #4 \n\t"
+ "1: \n\t"
+ "pld [%[pixels], %[line_size2]] \n\t"
+ "pld [%[pixels2], %[line_size2]] \n\t"
+ "wldrd wr4, [%[pixels]] \n\t"
+ "wldrd wr5, [%[pixels2]] \n\t"
+ "pld [%[block], #32] \n\t"
+ "wunpckelub wr6, wr4 \n\t"
+ "wldrd wr0, [%[block]] \n\t"
+ "wunpckehub wr7, wr4 \n\t"
+ "wldrd wr1, [%[block], #8] \n\t"
+ "wunpckelub wr8, wr5 \n\t"
+ "wldrd wr2, [%[block], #16] \n\t"
+ "wunpckehub wr9, wr5 \n\t"
+ "wldrd wr3, [%[block], #24] \n\t"
+ "add %[block], %[block], #32 \n\t"
+ "waddhss wr10, wr0, wr6 \n\t"
+ "waddhss wr11, wr1, wr7 \n\t"
+ "waddhss wr12, wr2, wr8 \n\t"
+ "waddhss wr13, wr3, wr9 \n\t"
+ "wpackhus wr14, wr10, wr11 \n\t"
+ "wpackhus wr15, wr12, wr13 \n\t"
+ "wstrd wr14, [%[pixels]] \n\t"
+ "add %[pixels], %[pixels], %[line_size2] \n\t"
+ "subs r12, r12, #1 \n\t"
+ "wstrd wr15, [%[pixels2]] \n\t"
+ "add %[pixels2], %[pixels2], %[line_size2] \n\t"
+ "bne 1b \n\t"
+ : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
+ : [line_size2]"r"(line_size << 1)
+ : "cc", "memory", "r12");
+}
+
+static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+ return;
+}
+
+void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
+{
+ c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
+
+ c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
+ c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
+ c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
+ c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
+
+ c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
+ c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
+ c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
+ c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
+
+ c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
+ c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
+ c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
+ c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
+
+ c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
+ c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
+ c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
+ c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
+ c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
+}