From 709bb45c660ae7c2d065bcade931e068620f9b92 Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Mon, 29 Jun 2015 20:57:14 +0530 Subject: avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions This patch adds MSA (MIPS-SIMD-Arch) optimizations for me_cmp functions in new file me_cmp_msa.c Signed-off-by: Shivraj Patil Signed-off-by: Michael Niedermayer --- libavutil/mips/generic_macros_msa.h | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'libavutil') diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h index b1e62b667d..d6a2573403 100644 --- a/libavutil/mips/generic_macros_msa.h +++ b/libavutil/mips/generic_macros_msa.h @@ -1295,6 +1295,29 @@ #define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__) #define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__) +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 (unsigned byte src & ref) + Outputs - sad_m (halfword vector with sad) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. From the 16 + unsigned absolute diff values, even-odd pairs are added + together to generate 8 halfword results. +*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ +( { \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \ + diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \ + \ + sad_m; \ +} ) + /* Description : Insert specified word elements from input vectors to 1 destination vector Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) @@ -2429,6 +2452,42 @@ } #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__) + +/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3 + Return Type - unsigned byte + Details : +*/ +#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3) \ +{ \ + v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \ + out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \ + \ + ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \ + out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \ + \ + ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \ + \ + tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ + ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \ + \ + tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \ + ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \ + out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ + out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ + \ + tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \ + tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \ + out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ + out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \ +} + /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15 -- cgit v1.2.3