summary | refs | log | tree | commit | diff
path: root/libavcodec/mips/hpeldsp_msa.c
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/mips/hpeldsp_msa.c')
-rw-r--r-- libavcodec/mips/hpeldsp_msa.c | 66
1 file changed, 42 insertions, 24 deletions
diff --git a/libavcodec/mips/hpeldsp_msa.c b/libavcodec/mips/hpeldsp_msa.c
index ad92f8f115..2bbe4771d4 100644
--- a/libavcodec/mips/hpeldsp_msa.c
+++ b/libavcodec/mips/hpeldsp_msa.c
@@ -59,12 +59,13 @@ static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t loop_cnt;
uint32_t out0, out1;
v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
+ v16i8 zeros = { 0 };
for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride);
- SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+ SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
out0 = __msa_copy_u_w((v4i32) res0, 0);
@@ -82,13 +83,14 @@ static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
{
uint8_t loop_cnt;
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+ v16i8 zeros = { 0 };
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
- SLDI_B4_0_SB(src0, src1, src2, src3,
- src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+ SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
+ src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
dst += (4 * dst_stride);
@@ -125,14 +127,15 @@ static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
+ v16i8 zeros = { 0 };
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
- SLDI_B4_0_SB(src0, src1, src2, src3,
- src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
- SLDI_B4_0_SB(src4, src5, src6, src7,
- src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1);
+ SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
+ src0_sld1, src1_sld1, src2_sld1, src3_sld1);
+ SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
+ src4_sld1, src5_sld1, src6_sld1, src7_sld1);
AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
@@ -145,10 +148,11 @@ static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride)
{
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+ v16i8 zeros = { 0 };
LD_SB4(src, src_stride, src0, src1, src2, src3);
- SLDI_B4_0_SB(src0, src1, src2, src3,
- src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+ SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
+ src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}
@@ -216,12 +220,13 @@ static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
v16u8 tmp0 = { 0 };
v16u8 tmp1 = { 0 };
+ v16i8 zeros = { 0 };
for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride);
- SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
+ SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
dst0 = LW(dst);
dst1 = LW(dst + dst_stride);
@@ -247,13 +252,14 @@ static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
{
uint8_t loop_cnt;
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
+ v16i8 zeros = { 0 };
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
- SLDI_B4_0_SB(src0, src1, src2, src3,
- src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
+ SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
+ src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
src3, src3_sld1, dst, dst_stride);
@@ -529,6 +535,7 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
v16u8 src0_r, src1_r, src2_r, res;
v8u16 add0, add1, add2, sum0, sum1;
+ v16i8 zeros = { 0 };
src0 = LD_SB(src);
src += src_stride;
@@ -537,7 +544,8 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
LD_SB2(src, src_stride, src1, src2);
src += (2 * src_stride);
- SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+ SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
+ src1_sld1, src2_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
src0_r, src1_r, src2_r);
HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
@@ -565,6 +573,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3;
+ v16i8 zeros = { 0 };
src0 = LD_SB(src);
src += src_stride;
@@ -573,8 +582,9 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
LD_SB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
- SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
- SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+ SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
+ src1_sld1, src2_sld1);
+ SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r);
ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
@@ -659,15 +669,17 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
v16i8 out0, out1;
+ v16i8 zeros = { 0 };
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
src8 = LD_UB(src);
- SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
- src3_sld1, 1);
- SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
- SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
+ SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
+ src0_sld1, src1_sld1, src2_sld1, src3_sld1);
+ SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
+ src5_sld1, src6_sld1);
+ SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
src3, src0_r, src1_r, src2_r, src3_r);
ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
@@ -703,13 +715,15 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3;
v16i8 out0, out1;
+ v16i8 zeros = { 0 };
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
src4 = LD_SB(src);
- SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
- SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+ SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
+ src1_sld1, src2_sld1);
+ SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r);
ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
@@ -918,6 +932,7 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
v16u8 src0_r, src1_r, src2_r;
v8u16 add0, add1, add2, sum0, sum1;
v16u8 dst0, dst1, res0, res1;
+ v16i8 zeros = { 0 };
src0 = LD_SB(src);
src += src_stride;
@@ -927,7 +942,8 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
src += (2 * src_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
- SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
+ SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
+ src1_sld1, src2_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r);
HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
@@ -959,6 +975,7 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3;
+ v16i8 zeros = { 0 };
src0 = LD_SB(src);
src += src_stride;
@@ -968,8 +985,9 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
- SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
+ SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
+ src1_sld1, src2_sld1);
+ SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r);
ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);