From ee3ef5fda2f11cb5bf555d4f49698eb5dcde6ee1 Mon Sep 17 00:00:00 2001
From: Shivraj Patil <shivraj.patil@imgtec.com>
Date: Sun, 14 Jun 2015 23:26:24 +0530
Subject: avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for hpel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for hpel functions in the new file hpeldsp_msa.c.
It also adds the new generic macros needed by this patch to libavutil/mips/generic_macros_msa.h.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libavutil/mips/generic_macros_msa.h | 162 ++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)

(limited to 'libavutil')

diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index e6e11e86de..272a42316f 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -804,6 +804,15 @@
 #define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
 #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
 
+#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2,  slide_val)     \
+{  /* three-input variant: in0/in1 via SLDI_B2_0, in2 done inline */      \
+    v16i8 zero_m = { 0 };  /* zero vector: first operand of sldi_b */     \
+    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
+    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
+}
+#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)  /* v16u8 outputs */
+#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)  /* v16i8 outputs */
+
 #define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                   out0, out1, out2, out3, slide_val)    \
 {                                                       \
@@ -1174,6 +1183,13 @@
 }
 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
 
+#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
+{  /* in2 feeds both operands of __msa_hadd_u_h */            \
+    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
+    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
+}
+#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)  /* v8u16 outputs */
+
 /* Description : Horizontal subtraction of unsigned byte vector elements
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -2408,6 +2424,67 @@
     out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
 }
 
+/* Description : Average byte elements from pairs of vectors and store an 8x4
+                 byte block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs - none (the result is written to memory at 'pdst')
+                 Return Type - none
+   Details     : Each byte element of the input vector pair 'in0' and 'in1' is
+                 averaged (a + b)/2 and stored in 'tmp0_m'
+                 Each byte element of the input vector pair 'in2' and 'in3' is
+                 averaged (a + b)/2 and stored in 'tmp1_m'
+                 Each byte element of the input vector pair 'in4' and 'in5' is
+                 averaged (a + b)/2 and stored in 'tmp2_m'
+                 Each byte element of the input vector pair 'in6' and 'in7' is
+                 averaged (a + b)/2 and stored in 'tmp3_m'
+                 Doubleword element 0 (8 bytes) of each of the 4 results is
+                 stored to destination memory via SD4, forming the 8x4 block
+*/
+#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                           \
+    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+                                                                            \
+    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
+    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
+    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
+    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
+    /* keep only doubleword 0 (8 bytes) of each averaged vector */          \
+    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
+    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
+    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
+    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
+    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
+}
+
+/* Description : Average byte elements from pairs of vectors and store a 16x4
+                 byte block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs - none (the result is written to memory at 'pdst')
+                 Return Type - none
+   Details     : Each byte element of the input vector pair 'in0' and 'in1' is
+                 averaged (a + b)/2 and stored in 'tmp0_m'
+                 Each byte element of the input vector pair 'in2' and 'in3' is
+                 averaged (a + b)/2 and stored in 'tmp1_m'
+                 Each byte element of the input vector pair 'in4' and 'in5' is
+                 averaged (a + b)/2 and stored in 'tmp2_m'
+                 Each byte element of the input vector pair 'in6' and 'in7' is
+                 averaged (a + b)/2 and stored in 'tmp3_m'
+                 The 4 full result vectors are stored to destination memory
+                 via ST_UB4, forming the 16x4 byte block
+*/
+#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+                                                                             \
+    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
+    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
+    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
+    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
+                                                                             \
+    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
+}
+
 /* Description : Average rounded byte elements from pair of vectors and store
                  8x4 byte block in destination memory
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
@@ -2439,6 +2516,91 @@
     SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
 }
 
+/* Description : Average rounded byte elements from pairs of vectors and store
+                 a 16x4 byte block in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs - none (the result is written to memory at 'pdst')
+                 Return Type - none
+   Details     : Each byte element of the input vector pair 'in0' and 'in1' is
+                 average rounded (a + b + 1)/2 and stored in 't0_m'
+                 Each byte element of the input vector pair 'in2' and 'in3' is
+                 average rounded (a + b + 1)/2 and stored in 't1_m'
+                 Each byte element of the input vector pair 'in4' and 'in5' is
+                 average rounded (a + b + 1)/2 and stored in 't2_m'
+                 Each byte element of the input vector pair 'in6' and 'in7' is
+                 average rounded (a + b + 1)/2 and stored in 't3_m'
+                 The 4 full result vectors are stored to destination memory
+                 via ST_UB4, forming the 16x4 byte block
+*/
+#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
+{                                                                             \
+    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
+                                                                              \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
+                t0_m, t1_m, t2_m, t3_m);                                      \
+    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
+}
+
+/* Description : Average rounded byte elements from pairs of vectors,
+                 average rounded with destination and store 8x4 byte block
+                 in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs - none (the result is written to memory at 'pdst')
+                 Return Type - none
+   Details     : LD_UB4 first loads 'dst0_m'..'dst3_m' from 'pdst'
+                 Each byte element of the input vector pairs ('in0', 'in1'),
+                 ('in2', 'in3'), ('in4', 'in5') and ('in6', 'in7') is average
+                 rounded (a + b + 1)/2 and stored in 'tmp0_m', 'tmp1_m',
+                 'tmp2_m' and 'tmp3_m' respectively
+                 AVER_ST8x4_UB then average rounds each 'tmpN_m' with the
+                 matching 'dstN_m' and stores the half vector results to
+                 destination memory as an 8x4 byte block
+                 NOTE: LD_UB4 presumably loads full 16-byte rows even though
+                 only 8 bytes per row are written back - confirm
+*/
+#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                          pdst, stride)                            \
+{                                                                  \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
+                                                                   \
+    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
+    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
+                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
+}
+
+/* Description : Average rounded byte elements from pairs of vectors,
+                 average rounded with destination and store 16x4 byte block
+                 in destination memory
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
+                 Outputs - none (the result is written to memory at 'pdst')
+                 Return Type - none
+   Details     : LD_UB4 first loads 'dst0_m'..'dst3_m' from 'pdst'
+                 Each byte element of the input vector pairs ('in0', 'in1'),
+                 ('in2', 'in3'), ('in4', 'in5') and ('in6', 'in7') is average
+                 rounded (a + b + 1)/2 and stored in 'tmp0_m', 'tmp1_m',
+                 'tmp2_m' and 'tmp3_m' respectively
+                 AVER_ST16x4_UB then average rounds each 'tmpN_m' with the
+                 matching 'dstN_m' and stores the full vector results to
+                 destination memory as a 16x4 byte block
+                 The destination is therefore read (LD_UB4) before it is
+                 overwritten (ST_UB4 inside AVER_ST16x4_UB)
+*/
+#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
+                           pdst, stride)                            \
+{                                                                   \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
+    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
+                                                                    \
+    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
+    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
+                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
+    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
+                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
+}
+
 /* Description : Add block 4x4
    Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
                  Outputs -
-- 
cgit v1.2.3