From 709bb45c660ae7c2d065bcade931e068620f9b92 Mon Sep 17 00:00:00 2001
From: Shivraj Patil <shivraj.patil@imgtec.com>
Date: Mon, 29 Jun 2015 20:57:14 +0530
Subject: avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for the me_cmp functions in the new file me_cmp_msa.c.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavutil/mips/generic_macros_msa.h | 59 +++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'libavutil')

diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index b1e62b667d..d6a2573403 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -1295,6 +1295,29 @@
 #define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
 #define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
 
+/* Description : SAD (Sum of Absolute Difference)
+   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
+                 Outputs - sad_m   (value of the macro; 8 halfword sums)
+                 Return Type - unsigned halfword
+   Details     : Absolute differences of the byte elements of 'in0' vs 'ref0'
+                 and 'in1' vs 'ref1' are kept in 'diff0_m' and 'diff1_m'. In
+                 each diff vector, even-odd byte pairs are added together to
+                 give 8 halfword results, which are accumulated in 'sad_m'.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
+( {                                                             \
+    v16u8 diff0_m, diff1_m;                                     \
+    v8u16 sad_m = { 0 };                                        \
+                                                                \
+    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
+    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
+                                                                \
+    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
+    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
+                                                                \
+    sad_m;                                                      \
+} )
+
 /* Description : Insert specified word elements from input vectors to 1
                  destination vector
    Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
@@ -2429,6 +2452,42 @@
 }
 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
+
+/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
+   Arguments   : Inputs  - in0 ... in15 (16 vectors, one per source row)
+                 Outputs - out0, out1, out2, out3
+                 Return Type - unsigned byte
+   Details     : Four rows are gathered per vector, then even/odd bytes and
+                 halfwords interleaved; presumably out<N> = byte N of every row
+*/
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
+                            in8, in9, in10, in11, in12, in13, in14, in15,  \
+                            out0, out1, out2, out3)                        \
+{                                                                          \
+    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+                                                                           \
+    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
+    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
+                                                                           \
+    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
+    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
+                                                                           \
+    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
+                                                                           \
+    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
+    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
+    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
+    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
+                                                                           \
+    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
+    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
+    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
+    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
+}
+
 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                            in8, in9, in10, in11, in12, in13, in14, in15
-- 
cgit v1.2.3