summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
author: James Darnley <jdarnley@obe.tv> 2017-03-16 15:07:11 +0100
committer: James Darnley <jdarnley@obe.tv> 2017-05-15 15:00:19 +0200
commit: 27460dfebc296636dec2584e0d74aaa4d48da0b9 (patch)
tree: 3dbbf78280ed29a9c384b15609e420bf96591c95 /libavcodec/x86
parent: f61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6 (diff)
avcodec/h264: add avx 8-bit h264_idct_dc_add
Haswell:
 - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext
Skylake-U:
 - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/h264_idct.asm | 20
-rw-r--r-- libavcodec/x86/h264dsp_init.c | 2
2 files changed, 22 insertions, 0 deletions
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 81fe793600..01078589db 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1158,7 +1158,27 @@ INIT_XMM avx
movd [%7+%8], %4
%endmacro
+%macro DC_ADD_INIT 1 ; arg 1: GPR holding the raw DC value; on exit m0/m1 hold packed +/- bytes and the GPR holds 3*stride
+ add %1d, 32 ; rounding bias for the shift that follows
+ sar %1d, 6 ; arithmetic shift: value = (dc + 32) >> 6, sign preserved
+ movd m0, %1d ; move the rounded value into the low dword of m0
+ pshuflw m0, m0, 0 ; replicate the low word into words 0-3 of m0
+ lea %1, [3*stride_q] ; the value now lives in m0, so the GPR is reused for 3*stride
+ pxor m1, m1 ; m1 = 0
+ psubw m1, m0 ; m1 = 0 - m0, i.e. the negated value in each word
+ packuswb m0, m0 ; unsigned-saturating pack to bytes: value clamped to 0..255 (0 when negative)
+ packuswb m1, m1 ; same for the negated value: -value clamped to 0..255 (0 when value is positive)
+%endmacro
+
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ ; named args: dst_, block_, stride_ (numeric fields follow the x86inc cglobal convention - defined outside this view)
movsxdifnidn stride_q, stride_d ; presumably widens the 32-bit stride to 64 bits where the two names differ - helper macro not visible here, confirm in x86inc
IDCT4_ADD dst_q, block_q, stride_q ; all work is delegated to IDCT4_ADD, which is defined outside this view
RET
+
+cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_ ; named args: dst_, block_, stride_; r3 is used as scratch below (numeric fields follow the x86inc cglobal convention - defined outside this view)
+ movsxdifnidn stride_q, stride_d ; presumably widens the 32-bit stride to 64 bits where the two names differ - helper macro not visible here, confirm in x86inc
+ movsx r3d, word [block_q] ; load the first 16-bit entry of block, sign-extended
+ mov dword [block_q], 0 ; clear the first 4 bytes of block: the word just read plus the one after it
+ DC_ADD_INIT r3 ; round/shift the value, build packed +/- byte vectors in m0/m1, leave r3 = 3*stride
+ DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 ; macro defined outside this view; receives dst, stride and r3 (= 3*stride), presumably to apply m0/m1 to four rows - confirm
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 8ba085f5e8..bf74937b3f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
+IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
@@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
}
c->h264_idct_add = ff_h264_idct_add_8_avx;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {