summary refs log tree commit diff
path: root/libavcodec
diff options
context:
space:
mode:
author: James Darnley <james.darnley@gmail.com> 2016-01-13 14:34:44 +0000
committer: James Darnley <james.darnley@gmail.com> 2016-02-05 17:26:04 +0100
commit: 7042a55c55d7ac04e9a6b3cd150c85d64cedb79f (patch)
tree: ee004806b6743c2f3609b7972b71caf980b4b363 /libavcodec
parent: 9556446623d6b1e6a1a1369e27fd3a04b3f4afa0 (diff)
avcodec/h264: mmxext 4:2:2 chroma deblock/loop filter
2.6 times faster (366 vs. 142 cycles)
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/x86/h264_deblock.asm 46
-rw-r--r-- libavcodec/x86/h264dsp_init.c 4
2 files changed, 47 insertions, 3 deletions
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 5151f3c9cd..8f80863fc5 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -864,7 +864,50 @@ ff_chroma_inter_body_mmxext:
DEBLOCK_P0_Q0
ret
+%define t5 r4
+%define t6 r5
+
+cglobal deblock_h_chroma422_8, 5, 6, 0, 0-(1+ARCH_X86_64*2)*mmsize ; h chroma deblock filter for 4:2:2, 8-bit; the stack-size expression is 1*mmsize on x86_32 and 3*mmsize on x86_64
+ %if ARCH_X86_64
+ %define buf0 [rsp+16] ; x86_64: spill slots sit in the reserved stack area, above the tc0 copy kept at [rsp]
+ %define buf1 [rsp+8]
+ %else
+ %define buf0 r0m ; x86_32: spill slots are r0m/r2m - presumably the stack homes of arguments 0 and 2 (x86inc naming; TODO confirm)
+ %define buf1 r2m
+ %endif
+
+ movd m6, [r4] ; read 4 bytes through r4 (5th argument, tc0 in the C prototype) before r4 is reused under its t5 alias
+ punpcklbw m6, m6 ; interleave with itself: each byte doubled -> b0 b0 b1 b1 b2 b2 b3 b3
+ movq [rsp], m6 ; keep the doubled bytes on the stack; each pass below reloads one half
+ CHROMA_H_START ; NOTE(review): macro defined outside this hunk; presumably prepares the t5/t6 row addressing - confirm
+
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) ; pass 1: load the first group of 8 rows (macro defined elsewhere)
+ movq buf0, m0 ; spill m0; it is reloaded just before the store
+ movq buf1, m3 ; spill m3; it is reloaded just before the store
+ LOAD_MASK r2d, r3d ; NOTE(review): presumably builds the filter mask in m7 from alpha (r2) and beta (r3) - confirm against the macro
+ movd m6, [rsp] ; low 4 bytes of the stack copy: b0 b0 b1 b1
+ punpcklwd m6, m6 ; double each word: b0 x4, b1 x4
+ pand m7, m6 ; AND into m7 (assumed to hold the mask from LOAD_MASK)
+ DEBLOCK_P0_Q0 ; filter core (macro defined elsewhere)
+ movq m0, buf0 ; restore the spilled m0
+ movq m3, buf1 ; restore the spilled m3
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) ; write back through the same row pointers (macro defined elsewhere)
+ lea r0, [r0+r1*8] ; advance r0 by 8*r1 (r1 = stride per the C prototype), i.e. 8 rows down
+ lea t5, [t5+r1*8] ; advance t5 by the same amount
+
+ TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) ; pass 2: same sequence on the next 8 rows
+ movq buf0, m0 ; spill m0 again
+ movq buf1, m3 ; spill m3 again
+ LOAD_MASK r2d, r3d ; same mask setup as in pass 1
+ movd m6, [rsp+4] ; high 4 bytes of the stack copy: b2 b2 b3 b3
+ punpcklwd m6, m6 ; double each word: b2 x4, b3 x4
+ pand m7, m6 ; AND into m7, as in pass 1
+ DEBLOCK_P0_Q0
+ movq m0, buf0 ; restore the spilled m0
+ movq m3, buf1 ; restore the spilled m3
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) ; write back the second group of rows
+RET
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
@@ -877,9 +920,6 @@ ff_chroma_inter_body_mmxext:
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
-%define t5 r4
-%define t6 r5
-
;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 35db20014a..c8cd0650c7 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -129,6 +129,8 @@ LF_IFUNC(v, chroma_intra, depth, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
+void ff_deblock_h_chroma422_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); /* MMXEXT 4:2:2 h chroma deblock filter, 8-bit; implemented in h264_deblock.asm */
+
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
@@ -245,6 +247,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
+ } else {
+ c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;