From 271195f85bbce284ac80ed31c62fba9b7e74e99d Mon Sep 17 00:00:00 2001 From: Shivraj Patil Date: Tue, 9 Jun 2015 20:50:26 +0530 Subject: avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC loop filter and sao functions in new file hevc_lpf_sao_msa.c Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h In this patch, in comparison with the previous patch, duplicated C functions are removed. Signed-off-by: Shivraj Patil Signed-off-by: Michael Niedermayer --- libavutil/mips/generic_macros_msa.h | 111 +++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) (limited to 'libavutil') diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h index 234aead065..f43798d6bb 100644 --- a/libavutil/mips/generic_macros_msa.h +++ b/libavutil/mips/generic_macros_msa.h @@ -761,6 +761,8 @@ out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \ } #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__) +#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__) +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) #define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ out0, out1, out2, out3, slide_val) \ @@ -821,6 +823,23 @@ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ } #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) 
VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective halfword elements from in0 & in1 are copied to out0 + as per control vector mask0 + Selective halfword elements from in2 & in3 are copied to out1 + as per control vector mask1 +*/ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \ + out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \ +} +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) /* Description : Shuffle byte vector elements as per mask vector Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 @@ -1060,6 +1079,25 @@ #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) #define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__) +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and even byte + elements of 'in1' are interleaved and copied to 'out0' + Even byte elements of 'in2' and even byte + elements of 'in3' are interleaved and copied to 'out1' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ +{ \ + out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \ + out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \ +} +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) +#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__) + /* Description : Interleave even halfword elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 @@ -1107,6 +1145,8 @@ out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \ } #define ILVEV_D2_UB(...) 
ILVEV_D2(v16u8, __VA_ARGS__) +#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__) +#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__) /* Description : Interleave left half of byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1212,9 +1252,22 @@ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ + out4, out5, out6, out7); \ +} +#define ILVR_B8_UH(...) 
ILVR_B8(v8u16, __VA_ARGS__) /* Description : Interleave right half of halfword elements from vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 @@ -1870,6 +1923,25 @@ ADD2(in4, in5, in6, in7, out2, out3); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Inputs - in (1 input byte vector) + Outputs - out0, out1 (sign extended 2 halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ +{ \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8) in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ +} + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Inputs - in (1 input unsigned byte vector) Outputs - out0, out1 (unsigned 2 halfword vectors) @@ -1903,6 +1975,18 @@ ILVRL_H2_SW(tmp_m, in, out0, out1); \ } +/* Description : Swap two variables + Arguments : Inputs - in0, in1 + Outputs - in0, in1 (in-place) + Details : Swapping of two input variables using xor; in0 and in1 must be distinct objects (an xor swap of an object with itself zeroes it) +*/ +#define SWAP(in0, in1) \ +{ \ + in0 = in0 ^ in1; \ + in1 = in0 ^ in1; \ + in0 = in0 ^ in1; \ +} + /* Description : Butterfly of 4 input vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 @@ -1959,9 +2043,34 @@ out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \ out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \ } - #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__) +#define TRANSPOSE8x4_UB_UH(...) 
TRANSPOSE8x4_UB(v8u16, __VA_ARGS__) +/* Description : Transposes input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + (input 8x8 byte block) + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + (output 8x8 byte block) + Return Type - as per RTYPE + Details : out0, out2, out4 and out6 are built from the inputs with ILVR_B4, ILVRL_B2 and ILVRL_W2; out1, out3, out5 and out7 are those vectors slid by 8 bytes with SLDI_B2_0 +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ +{ \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ +} +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) +#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__) /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15 -- cgit v1.2.3