summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShivraj Patil <shivraj.patil@imgtec.com>2015-06-29 20:57:14 +0530
committerMichael Niedermayer <michaelni@gmx.at>2015-07-06 18:25:14 +0200
commit709bb45c660ae7c2d065bcade931e068620f9b92 (patch)
tree4d6b5bb2ae122529ce93cbeffa9d78be3f56d444
parent2f3f98af2b3215b7f3ab302275a0b3b4acaf84a5 (diff)
avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions in new file me_cmp_msa.c Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r--libavcodec/me_cmp.c2
-rw-r--r--libavcodec/me_cmp.h1
-rw-r--r--libavcodec/mips/Makefile2
-rw-r--r--libavcodec/mips/me_cmp_init_mips.c56
-rw-r--r--libavcodec/mips/me_cmp_mips.h60
-rw-r--r--libavcodec/mips/me_cmp_msa.c686
-rw-r--r--libavutil/mips/generic_macros_msa.h59
7 files changed, 866 insertions, 0 deletions
diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c
index d4213d2759..dc76b07ba2 100644
--- a/libavcodec/me_cmp.c
+++ b/libavcodec/me_cmp.c
@@ -991,4 +991,6 @@ av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
ff_me_cmp_init_ppc(c, avctx);
if (ARCH_X86)
ff_me_cmp_init_x86(c, avctx);
+ if (ARCH_MIPS)
+ ff_me_cmp_init_mips(c, avctx);
}
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index 98ee53ce2a..a3603ec2c1 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -87,6 +87,7 @@ void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx);
void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type);
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 59c1f7947a..29938912f8 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -31,6 +31,7 @@ OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_init_mips.o
+OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_init_mips.o
MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
mips/hevc_mc_uni_msa.o \
mips/hevc_mc_uniw_msa.o \
@@ -51,5 +52,6 @@ MSA-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_msa.o
MSA-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_msa.o
MSA-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_msa.o
MSA-OBJS-$(CONFIG_MPEGVIDEOENC) += mips/mpegvideoencdsp_msa.o
+MSA-OBJS-$(CONFIG_ME_CMP) += mips/me_cmp_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
diff --git a/libavcodec/mips/me_cmp_init_mips.c b/libavcodec/mips/me_cmp_init_mips.c
new file mode 100644
index 0000000000..219a0dc00c
--- /dev/null
+++ b/libavcodec/mips/me_cmp_init_mips.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "me_cmp_mips.h"
+
+#if HAVE_MSA
+static av_cold void me_cmp_msa(MECmpContext *c, AVCodecContext *avctx)
+{   /* Fill the MECmpContext function tables with the MSA kernels; avctx is not read. */
+#if BIT_DEPTH == 8
+    c->sad[0] = ff_pix_abs16_msa;   /* sad[] reuses the plain pix_abs kernels */
+    c->sad[1] = ff_pix_abs8_msa;
+    c->sse[0] = ff_sse16_msa;
+    c->sse[1] = ff_sse8_msa;
+    c->sse[2] = ff_sse4_msa;
+
+    c->hadamard8_diff[0] = ff_hadamard8_diff16_msa;
+    c->hadamard8_diff[1] = ff_hadamard8_diff8x8_msa;
+    c->hadamard8_diff[4] = ff_hadamard8_intra16_msa;
+    c->hadamard8_diff[5] = ff_hadamard8_intra8x8_msa;
+
+    /* pix_abs[0][*] gets the 16-wide kernels, pix_abs[1][*] the 8-wide ones: plain, x2, y2, xy2 */
+    c->pix_abs[0][0] = ff_pix_abs16_msa;
+    c->pix_abs[0][1] = ff_pix_abs16_x2_msa;
+    c->pix_abs[0][2] = ff_pix_abs16_y2_msa;
+    c->pix_abs[0][3] = ff_pix_abs16_xy2_msa;
+    c->pix_abs[1][0] = ff_pix_abs8_msa;
+    c->pix_abs[1][1] = ff_pix_abs8_x2_msa;
+    c->pix_abs[1][2] = ff_pix_abs8_y2_msa;
+    c->pix_abs[1][3] = ff_pix_abs8_xy2_msa;
+#endif
+}
+#endif // #if HAVE_MSA
+
+av_cold void ff_me_cmp_init_mips(MECmpContext *c, AVCodecContext *avctx) /* MIPS hook; ff_me_cmp_init() calls it under if (ARCH_MIPS) */
+{
+#if HAVE_MSA /* when HAVE_MSA is false the body is empty and c is left untouched */
+ me_cmp_msa(c, avctx);
+#endif // #if HAVE_MSA
+}
diff --git a/libavcodec/mips/me_cmp_mips.h b/libavcodec/mips/me_cmp_mips.h
new file mode 100644
index 0000000000..e0d0f51af8
--- /dev/null
+++ b/libavcodec/mips/me_cmp_mips.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MIPS_ME_CMP_MIPS_H /* prototypes of the MSA kernels that me_cmp_init_mips.c installs */
+#define AVCODEC_MIPS_ME_CMP_MIPS_H
+
+#include "../mpegvideo.h" /* NOTE(review): presumably provides MpegEncContext - confirm */
+#include "libavcodec/bit_depth_template.c" /* NOTE(review): a .c template included from a header; presumably the source of BIT_DEPTH tested in me_cmp_init_mips.c - confirm */
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, /* Hadamard scorers; note the (dst, src) parameter order */
+ ptrdiff_t stride, int h);
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+ ptrdiff_t stride, int h);
+int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, /* generated by WRAPPER8_16_SQ in me_cmp_msa.c */
+ ptrdiff_t stride, int h);
+int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, /* generated by WRAPPER8_16_SQ in me_cmp_msa.c */
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, /* SAD wrappers: plain, then _x2 / _y2 / _xy2 variants */
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+ ptrdiff_t stride, int h);
+int ff_sse16_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref, /* wrappers around sse_{16,8,4}width_msa */
+ ptrdiff_t stride, int i32Height);
+int ff_sse8_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+ ptrdiff_t stride, int i32Height);
+int ff_sse4_msa(MpegEncContext *v, uint8_t *pu8Src, uint8_t *pu8Ref,
+ ptrdiff_t stride, int i32Height);
+void ff_add_pixels8_msa(uint8_t *av_restrict pixels, int16_t *block, /* NOTE(review): no definition in me_cmp_msa.c - confirm where this lives */
+ ptrdiff_t stride);
+
+#endif // #ifndef AVCODEC_MIPS_ME_CMP_MIPS_H
diff --git a/libavcodec/mips/me_cmp_msa.c b/libavcodec/mips/me_cmp_msa.c
new file mode 100644
index 0000000000..0e3165cd8f
--- /dev/null
+++ b/libavcodec/mips/me_cmp_msa.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mips/generic_macros_msa.h"
+#include "me_cmp_mips.h"
+
+static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride, /* SAD of src against ref, four rows per loop pass */
+ uint8_t *ref, int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v8u16 sad = { 0 }; /* halfword accumulator, reduced once at the end */
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, /* presumably packs the low 8 bytes of two rows into one vector - confirm in generic_macros_msa.h */
+ src0, src1, ref0, ref1);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride, /* SAD of src against ref; loaded rows go to SAD_UB2_UH unpacked */
+ uint8_t *ref, int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 }; /* halfword accumulator, reduced once at the end */
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) { /* each pass is two identical 2-row steps */
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, /* SAD of src against the average of ref and ref shifted by one byte */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 3 groups of eight rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, comp0, comp1;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) { /* each pass is two identical 4-row steps */
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); /* ref4/ref5: unshifted rows, packed before ref0..ref3 are overwritten */
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); /* presumably slides each row by one byte so x+1 lines up with x - confirm */
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); /* ref0/ref1: shifted rows, packed the same way */
+ AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); /* average of unshifted and shifted rows (rounding defined by the macro) */
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
+ SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+ SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src, /* SAD of src against the average of rows loaded at ref and at ref + 1 */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 3 groups of eight rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, comp0, comp1;
+ v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31; /* refR0: row R at ref, refR1: row R at ref + 1 */
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) { /* each pass is two identical 4-row steps */
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+ LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31); /* same rows, one byte to the right */
+ ref += (4 * ref_stride);
+
+ AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1); /* rows 0 and 1 of this step */
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1); /* rows 2 and 3 of this step */
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
+ LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
+ ref += (4 * ref_stride);
+
+ AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src, /* SAD of src against the average of each ref row and the row below it */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 3 groups of eight rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, comp0, comp1;
+ v16u8 ref0, ref1, ref2, ref3, ref4;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) { /* each pass is two identical 4-row steps */
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4); /* five registers filled while ref advances only four rows: the last row is loaded again by the next step */
+ ref += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1); /* ref0 <- rows (0,1), ref1 <- rows (1,2) */
+ PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3); /* ref2 <- rows (2,3), ref3 <- rows (3,4) */
+ AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1); /* averages each row pair with the pair one row lower */
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
+ ref += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
+ PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
+ AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src, /* SAD of src against the average of each ref row and the row below it */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 3 groups of eight rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, comp0, comp1;
+ v16u8 ref0, ref1, ref2, ref3, ref4;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) {
+ LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); /* ref4 receives the first row of the group */
+ ref += (5 * ref_stride);
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1); /* each row averaged with the row before it */
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+ ref4 = ref3; /* carry the last loaded row into the second half */
+
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (3 * ref_stride); /* 5 + 3 = 8 rows per pass: the next pass starts by reloading the last row loaded here */
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src, /* SAD of src against a 2x2 neighbourhood sum of ref, rounding-shifted right by 2 */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, temp0, temp1, diff;
+ v16u8 ref0, ref1, ref2, ref3, ref4;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; /* byte index pairs (x, x + 1) for x = 0..7 */
+ v8u16 comp0, comp1, comp2, comp3; /* per-row sums of horizontally adjacent bytes */
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3); /* ref4 receives the first row; ref advances four rows, so the fifth is reloaded next pass */
+ ref += (4 * ref_stride);
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+
+ VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ comp0 += comp1; /* ref4 + ref0 pair sums: first output row */
+ comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
+ comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
+
+ temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
+ comp2 = __msa_hadd_u_h(temp0, temp0);
+ comp1 += comp2; /* comp1 still holds the unrounded ref0 sums: second output row */
+ comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
+ comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
+ comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0); /* both output rows in one vector, to match the packed src0 */
+ diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
+ comp3 = __msa_hadd_u_h(temp1, temp1);
+ comp2 += comp3; /* ref1 + ref2 sums: third output row */
+ comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
+ comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
+
+ temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp3 += comp0; /* ref2 + ref3 sums: fourth output row */
+ comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
+ comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
+ comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
+ diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src, /* SAD of src against a 2x2 neighbourhood sum of ref, rounding-shifted right by 2 */
+ int32_t src_stride,
+ uint8_t *ref,
+ int32_t ref_stride,
+ int32_t height) /* only height >> 3 groups of eight rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, comp, diff;
+ v16u8 temp0, temp1, temp2, temp3;
+ v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14; /* ref0N: rows at ref, ref1N: same rows at ref + 1 */
+ v8u16 comp0, comp1, comp2, comp3; /* comp0/comp1 and comp2/comp3 alternate as the sums of the previous and the current row */
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03); /* ref04 receives the first row of the group */
+ LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
+ ref += (5 * ref_stride);
+
+ ILVRL_B2_UB(ref14, ref04, temp0, temp1); /* presumably interleaves the ref and ref + 1 bytes so hadd yields ref[x] + ref[x + 1] - confirm */
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+ comp2 = __msa_hadd_u_h(temp2, temp2);
+ comp3 = __msa_hadd_u_h(temp3, temp3);
+ comp0 += comp2; /* previous row sums + current row sums */
+ comp1 += comp3;
+ SRARI_H2_UH(comp0, comp1, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+ diff = __msa_asub_u_b(src0, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ comp2 += comp0; /* comp2/comp3 still hold the unrounded sums of the row before */
+ comp3 += comp1;
+ SRARI_H2_UH(comp2, comp3, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+ diff = __msa_asub_u_b(src1, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+ comp2 = __msa_hadd_u_h(temp2, temp2);
+ comp3 = __msa_hadd_u_h(temp3, temp3);
+ comp0 += comp2;
+ comp1 += comp3;
+ SRARI_H2_UH(comp0, comp1, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+ diff = __msa_asub_u_b(src2, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ comp2 += comp0;
+ comp3 += comp1;
+ SRARI_H2_UH(comp2, comp3, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+ diff = __msa_asub_u_b(src3, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3); /* second half: comp0/comp1 carry the last row sums from above */
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
+ LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
+ ref += (3 * ref_stride); /* 5 + 3 = 8 rows per pass: the next pass starts by reloading the last row loaded here */
+
+ ILVRL_B2_UB(ref10, ref00, temp2, temp3);
+ comp2 = __msa_hadd_u_h(temp2, temp2);
+ comp3 = __msa_hadd_u_h(temp3, temp3);
+ comp0 += comp2;
+ comp1 += comp3;
+ SRARI_H2_UH(comp0, comp1, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+ diff = __msa_asub_u_b(src0, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref11, ref01, temp0, temp1);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ comp2 += comp0;
+ comp3 += comp1;
+ SRARI_H2_UH(comp2, comp3, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+ diff = __msa_asub_u_b(src1, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref12, ref02, temp2, temp3);
+ comp2 = __msa_hadd_u_h(temp2, temp2);
+ comp3 = __msa_hadd_u_h(temp3, temp3);
+ comp0 += comp2;
+ comp1 += comp3;
+ SRARI_H2_UH(comp0, comp1, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
+ diff = __msa_asub_u_b(src2, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+
+ ILVRL_B2_UB(ref13, ref03, temp0, temp1);
+ comp0 = __msa_hadd_u_h(temp0, temp0);
+ comp1 = __msa_hadd_u_h(temp1, temp1);
+ comp2 += comp0;
+ comp3 += comp1;
+ SRARI_H2_UH(comp2, comp3, 2);
+ comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
+ diff = __msa_asub_u_b(src3, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return (HADD_UH_U32(sad)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+#define CALC_MSE_B(src, ref, var) /* accumulates into 'var' (passed as v4i32 by the sse_* callers) from byte vectors src and ref */ \
+{ /* bare block, not do { } while (0): callers follow it with ';' */ \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); /* presumably the src - ref byte differences as halfwords - confirm */ \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); /* both factors are the same registers, i.e. the differences are multiplied by themselves */ \
+}
+
+static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride, /* feeds four rows at a time, one 32-bit word per row, to CALC_MSE_B */
+ uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ uint32_t sse;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v4i32 var = { 0 }; /* accumulator updated by CALC_MSE_B */
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src); /* presumably places the four words into one vector - confirm */
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_B(src, ref, var);
+ }
+
+ sse = HADD_SW_S32(var); /* presumably the horizontal sum of the word lanes - confirm */
+
+ return sse;
+}
+
+static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride, /* feeds four rows at a time, packed two rows per vector, to CALC_MSE_B */
+ uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ uint32_t sse;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 }; /* accumulator updated by CALC_MSE_B */
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, /* presumably packs the low 8 bytes of two rows into one vector - confirm */
+ src0, src1, ref0, ref1);
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ sse = HADD_SW_S32(var); /* presumably the horizontal sum of the word lanes - confirm */
+
+ return sse;
+}
+
+static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride, /* feeds one full vector per row to CALC_MSE_B, four rows per pass */
+ uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) /* only height >> 2 groups of four rows are visited; leftover rows are ignored */
+{
+ int32_t ht_cnt;
+ uint32_t sse;
+ v16u8 src, ref;
+ v4i32 var = { 0 }; /* accumulator updated by CALC_MSE_B */
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) { /* the same row step is written out four times */
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+
+ sse = HADD_SW_S32(var); /* presumably the horizontal sum of the word lanes - confirm */
+
+ return sse;
+}
+
+static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride, /* sums the absolute values left after butterfly/transpose passes over the src/ref differences of 8 rows */
+ uint8_t *ref, int32_t ref_stride)
+{
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+ v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v8i16 sum = { 0 };
+ v8i16 zero = { 0 }; /* second operand for __msa_add_a_h so that it yields |x| */
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3, /* interleave + HSUB below: presumably the per-pixel differences as halfwords - confirm */
+ src4, ref4, src5, ref5, src6, ref6, src7, ref7,
+ diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+ HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
+ HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
+ TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
+ diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+ BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1, /* three butterfly passes before the second transpose */
+ temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+ BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+ diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+ BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+ temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+ TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+ temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+ BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1, /* two butterfly passes; the last stage is folded into the sums below */
+ diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+ BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+ temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+ ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7, /* presumably diffN = tempN + temp(N+4) - confirm */
+ diff0, diff1, diff2, diff3);
+ sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7); /* |tempN - temp(N+4)| for the difference half */
+ sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+ sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+ sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+ sum += __msa_add_a_h((v8i16) diff0, zero); /* |diffN| + |0| for the sum half */
+ sum += __msa_add_a_h((v8i16) diff1, zero);
+ sum += __msa_add_a_h((v8i16) diff2, zero);
+ sum += __msa_add_a_h((v8i16) diff3, zero);
+
+ return (HADD_UH_U32(sum)); /* presumably the horizontal sum of the halfword lanes - confirm */
+}
+
+static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride, /* same butterfly/transpose scoring as hadamard_diff_8x8_msa, applied to src alone */
+ uint8_t *ref, int32_t ref_stride) /* ref and ref_stride are not used in the body */
+{
+ int32_t sum_res = 0;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+ v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v8i16 sum = { 0 };
+ v16i8 zero = { 0 }; /* used both to widen the bytes and as the |0| operand of __msa_add_a_h */
+
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7, /* here the first transpose is done on bytes, before widening */
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3, /* presumably zero-extends the low 8 bytes to halfwords - confirm */
+ zero, src4, zero, src5, zero, src6, zero, src7,
+ diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
+ BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
+ temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
+ BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
+ diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
+ BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
+ temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
+ TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
+ temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
+ BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
+ diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
+ BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
+ temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
+ ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7, /* presumably diffN = tempN + temp(N+4) - confirm */
+ diff0, diff1, diff2, diff3);
+ sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
+ sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
+ sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
+ sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
+ sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
+ sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
+ sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
+ sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
+ sum_res = (HADD_UH_U32(sum));
+ sum_res -= abs(temp0[0] + temp4[0]); /* takes the lane-0 term of temp0 + temp4 back out; presumably the DC term - confirm */
+
+ return sum_res;
+}
+
+int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                     ptrdiff_t stride, int height)
+{   /* Forwards to sad_16width_msa with one stride for both pointers; v is unused. */
+    return (int) sad_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                    ptrdiff_t stride, int height)
+{   /* Forwards to sad_8width_msa with one stride for both pointers; v is unused. */
+    return (int) sad_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{   /* Forwards to the 16-wide horizontal-filter SAD with one stride; v is unused. */
+    return (int) sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{   /* Forwards to the 16-wide vertical-filter SAD with one stride; v is unused. */
+    return (int) sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         ptrdiff_t stride, int h)
+{   /* Forwards to the 16-wide horizontal+vertical-filter SAD with one stride; v is unused. */
+    return (int) sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{   /* Forwards to the 8-wide horizontal-filter SAD with one stride; v is unused. */
+    return (int) sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                       ptrdiff_t stride, int h)
+{   /* Forwards to the 8-wide vertical-filter SAD with one stride; v is unused. */
+    return (int) sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                        ptrdiff_t stride, int h)
+{   /* Forwards to the 8-wide horizontal+vertical-filter SAD with one stride; v is unused. */
+    return (int) sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
+}
+
+int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                 ptrdiff_t stride, int height)
+{   /* Forwards to sse_16width_msa with one stride for both pointers; v is unused. */
+    return (int) sse_16width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{   /* Forwards to sse_8width_msa with one stride for both pointers; v is unused. */
+    return (int) sse_8width_msa(src, stride, ref, stride, height);
+}
+
+int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
+                ptrdiff_t stride, int height)
+{   /* Forwards to sse_4width_msa with one stride for both pointers; v is unused. */
+    return (int) sse_4width_msa(src, stride, ref, stride, height);
+}
+
+int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                             ptrdiff_t stride, int h)
+{   /* Passes src first and dst second to the static helper; s and h are unused. */
+    return (int) hadamard_diff_8x8_msa(src, stride, dst, stride);
+}
+
+int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
+                              ptrdiff_t stride, int h)
+{   /* Passes src first and dst second to the static helper; s and h are unused. */
+    return (int) hadamard_intra_8x8_msa(src, stride, dst, stride);
+}
+
+/* Hadamard Transform functions: name16 scores a 16-wide block by summing name8 over 8x8 tiles */
+#define WRAPPER8_16_SQ(name8, name16)                             \
+int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,         \
+           ptrdiff_t stride, int h)                               \
+{                                                                 \
+    int score = 0;                                                \
+    score += name8(s, dst, src, stride, 8); /* top-left tile */   \
+    score += name8(s, dst + 8, src + 8, stride, 8); /* top-right */ \
+    if(h == 16) { /* lower tiles are scored only for h == 16 */   \
+        dst += 8 * stride;                                        \
+        src += 8 * stride;                                        \
+        score +=name8(s, dst, src, stride, 8);                    \
+        score +=name8(s, dst + 8, src + 8, stride, 8);            \
+    }                                                             \
+    return score;                                                 \
+}
+
+WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa) /* no ';': the macro expands to a complete function definition */
+WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa) /* no ';': a stray one at file scope is not valid ISO C */
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index b1e62b667d..d6a2573403 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -1295,6 +1295,29 @@
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
+/* Description : SAD (Sum of Absolute Difference)
+ Arguments : Inputs - in0, in1, ref0, ref1 (unsigned byte src & ref)
+ Outputs - sad_m (halfword vector with sad)
+ Return Type - unsigned halfword
+ Details : Absolute differences of the byte elements of 'in0' with 'ref0'
+ and of 'in1' with 'ref1' are kept in 'diff0_m' and 'diff1_m'.
+ For each of them the even-odd byte pairs are added together,
+ and both 8-halfword results are accumulated into 'sad_m'.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+( { /* statement expression: the value of the macro is the final 'sad_m;' */ \
+ v16u8 diff0_m, diff1_m; \
+ v8u16 sad_m = { 0 }; \
+ \
+ diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \
+ diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \
+ \
+ sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
+ sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
+ \
+ sad_m; \
+} )
+
/* Description : Insert specified word elements from input vectors to 1
destination vector
Arguments : Inputs - in0, in1, in2, in3 (4 input vectors)
@@ -2429,6 +2452,42 @@
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
+
+/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3
+ Return Type - unsigned byte
+ Details : Only interleaves are used (ILVEV_W2_SD / ILVEV_B2_SD and the ilvev / ilvod intrinsics); out1 and out3 also serve as scratch
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3) \
+{ /* bare block, not do { } while (0) */ \
+ v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); /* inputs are gathered in groups of every fourth row: 0,4,8,12 */ \
+ out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); /* out1 is scratch here; its final value is set at the end */ \
+ \
+ ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
+ out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); /* out3 is scratch here as well */ \
+ \
+ ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
+ \
+ tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
+ ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
+ \
+ tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
+ ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
+ out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
+ out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
+ \
+ tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); /* last reads of the scratch values in out1/out3 */ \
+ tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
+ out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
+ out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
+}
+
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
in8, in9, in10, in11, in12, in13, in14, in15