summaryrefslogtreecommitdiff
path: root/libavcodec
diff options
context:
space:
mode:
authorKaustubh Raste <kaustubh.raste@imgtec.com>2017-07-24 18:11:53 +0530
committerMichael Niedermayer <michael@niedermayer.cc>2017-07-25 22:04:34 +0200
commita776cb2074484f4c07ddcdc03398879d9587edcd (patch)
tree2f292693d65982d41d1fb58ab7bac50e1c7aaad6 /libavcodec
parent0563a5d175428943c741be2d18ac6010b868fc91 (diff)
libavcodec/mips: Optimize avc idct 4x4 for msa
Removed memset call and improved performance. Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com> Reviewed-by: Manojkumar Bhosale <Manojkumar.Bhosale@imgtec.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/mips/h264idct_msa.c104
1 files changed, 56 insertions, 48 deletions
diff --git a/libavcodec/mips/h264idct_msa.c b/libavcodec/mips/h264idct_msa.c
index fac1e7add4..81e09e9b16 100644
--- a/libavcodec/mips/h264idct_msa.c
+++ b/libavcodec/mips/h264idct_msa.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
+ * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
*
* This file is part of FFmpeg.
*
@@ -36,48 +36,6 @@
BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3); \
}
-static void avc_idct4x4_addblk_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- v8i16 src0, src1, src2, src3;
- v8i16 hres0, hres1, hres2, hres3;
- v8i16 vres0, vres1, vres2, vres3;
- v8i16 zeros = { 0 };
-
- LD4x4_SH(src, src0, src1, src2, src3);
- AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
- TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
- AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
- SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
- ADDBLK_ST4x4_UB(vres0, vres1, vres2, vres3, dst, dst_stride);
- ST_SH2(zeros, zeros, src, 8);
-}
-
-static void avc_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
-{
- int16_t dc;
- uint32_t src0, src1, src2, src3;
- v16u8 pred = { 0 };
- v16i8 out;
- v8i16 input_dc, pred_r, pred_l;
-
- dc = (src[0] + 32) >> 6;
- input_dc = __msa_fill_h(dc);
- src[0] = 0;
-
- LW4(dst, dst_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, pred);
- UNPCK_UB_SH(pred, pred_r, pred_l);
-
- pred_r += input_dc;
- pred_l += input_dc;
-
- CLIP_SH2_0_255(pred_r, pred_l);
- out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
int32_t de_q_val)
{
@@ -317,11 +275,45 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
ST8x4_UB(dst2, dst3, dst, dst_stride);
}
-void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src,
- int32_t dst_stride)
+void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
{
- avc_idct4x4_addblk_msa(dst, src, dst_stride);
- memset(src, 0, 16 * sizeof(dctcoef));
+ uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
+ v16i8 dst0_m = { 0 };
+ v16i8 dst1_m = { 0 };
+ v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
+ v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
+ const v8i16 src0 = LD_SH(src);
+ const v8i16 src2 = LD_SH(src + 8);
+ const v8i16 zero = { 0 };
+ const uint8_t *dst1 = dst + dst_stride;
+ const uint8_t *dst2 = dst + 2 * dst_stride;
+ const uint8_t *dst3 = dst + 3 * dst_stride;
+
+ ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
+ ST_SH2(zero, zero, src, 8);
+ AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
+ TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
+ AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
+ src0_m = LW(dst);
+ src1_m = LW(dst1);
+ SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
+ src2_m = LW(dst2);
+ src3_m = LW(dst3);
+ ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
+ INSERT_W2_SB(src0_m, src1_m, dst0_m);
+ INSERT_W2_SB(src2_m, src3_m, dst1_m);
+ ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
+ CLIP_SH2_0_255(res0_m, res1_m);
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
+ out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
+ out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
+ out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
+ out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
+ SW(out0_m, dst);
+ SW(out1_m, dst1);
+ SW(out2_m, dst2);
+ SW(out3_m, dst3);
}
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
@@ -334,7 +326,23 @@ void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src,
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
int32_t dst_stride)
{
- avc_idct4x4_addblk_dc_msa(dst, src, dst_stride);
+ v16u8 pred = { 0 };
+ v16i8 out;
+ v8i16 pred_r, pred_l;
+ const uint32_t src0 = LW(dst);
+ const uint32_t src1 = LW(dst + dst_stride);
+ const uint32_t src2 = LW(dst + 2 * dst_stride);
+ const uint32_t src3 = LW(dst + 3 * dst_stride);
+ const int16_t dc = (src[0] + 32) >> 6;
+ const v8i16 input_dc = __msa_fill_h(dc);
+
+ src[0] = 0;
+ INSERT_W4_UB(src0, src1, src2, src3, pred);
+ UNPCK_UB_SH(pred, pred_r, pred_l);
+ ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
+ CLIP_SH2_0_255(pred_r, pred_l);
+ out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,