From 3a09494939ddb2f2fd0f8d015162d5174ec07d4c Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Tue, 24 Dec 2013 16:17:03 -0500
Subject: vp9mc/x86: add 16px functions (64bit only).

Signed-off-by: Anton Khirnov
---
 libavcodec/x86/vp9dsp_init.c |   5 ++
 libavcodec/x86/vp9mc.asm     | 122 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 833d983ab1..dc08e60662 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -64,6 +64,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
 
 mc_funcs(4);
 mc_funcs(8);
+#if ARCH_X86_64
+mc_funcs(16);
+#endif
 
 #undef mc_funcs
 #undef mc_func
@@ -95,7 +98,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
     mc_rep_func(put, sz, hsz, v, ssse3); \
     mc_rep_func(avg, sz, hsz, v, ssse3)
 
+#if ARCH_X86_32
 mc_rep_funcs(16, 8);
+#endif
 mc_rep_funcs(32, 16);
 mc_rep_funcs(64, 32);
 
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 59e56687f2..152715c9b9 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -144,6 +144,62 @@ INIT_XMM ssse3
 filter_h_fn put
 filter_h_fn avg
 
+%if ARCH_X86_64
+%macro filter_hx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, src, dstride, sstride, h, filtery
+    mova        m13, [pw_256]
+    mova         m8, [filteryq+ 0]
+    mova         m9, [filteryq+16]
+    mova        m10, [filteryq+32]
+    mova        m11, [filteryq+48]
+.loop:
+    movu         m0, [srcq-3]
+    movu         m1, [srcq-2]
+    movu         m2, [srcq-1]
+    movu         m3, [srcq+0]
+    movu         m4, [srcq+1]
+    movu         m5, [srcq+2]
+    movu         m6, [srcq+3]
+    movu         m7, [srcq+4]
+    add        srcq, sstrideq
+    SBUTTERFLY   bw, 0, 1, 12
+    SBUTTERFLY   bw, 2, 3, 12
+    SBUTTERFLY   bw, 4, 5, 12
+    SBUTTERFLY   bw, 6, 7, 12
+    pmaddubsw    m0, m8
+    pmaddubsw    m1, m8
+    pmaddubsw    m2, m9
+    pmaddubsw    m3, m9
+    pmaddubsw    m4, m10
+    pmaddubsw    m5, m10
+    pmaddubsw    m6, m11
+    pmaddubsw    m7, m11
+    paddw        m0, m2
+    paddw        m1, m3
+    paddw        m4, m6
+    paddw        m5, m7
+    paddsw       m0, m4
+    paddsw       m1, m5
+    pmulhrsw     m0, m13
+    pmulhrsw     m1, m13
+    packuswb     m0, m1
+%ifidn %1, avg
+    pavgb        m0, [dstq]
+%endif
+    mova     [dstq], m0
+    add        dstq, dstrideq
+    dec          hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_hx2_fn put
+filter_hx2_fn avg
+
+%endif ; ARCH_X86_64
+
 %macro filter_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
@@ -218,6 +274,72 @@ INIT_XMM ssse3
 filter_v_fn put
 filter_v_fn avg
 
+%if ARCH_X86_64
+
+%macro filter_vx2_fn 1
+%assign %%px mmsize
+cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, src, dstride, sstride, h, filtery, src4, sstride3
+    sub        srcq, sstrideq
+    lea   sstride3q, [sstrideq*3]
+    sub        srcq, sstrideq
+    mova        m13, [pw_256]
+    sub        srcq, sstrideq
+    mova         m8, [filteryq+ 0]
+    lea       src4q, [srcq+sstrideq*4]
+    mova         m9, [filteryq+16]
+    mova        m10, [filteryq+32]
+    mova        m11, [filteryq+48]
+.loop:
+    ; FIXME maybe reuse loads from previous rows, or just
+    ; more generally unroll this to prevent multiple loads of
+    ; the same data?
+    movu         m0, [srcq]
+    movu         m1, [srcq+sstrideq]
+    movu         m2, [srcq+sstrideq*2]
+    movu         m3, [srcq+sstride3q]
+    movu         m4, [src4q]
+    movu         m5, [src4q+sstrideq]
+    movu         m6, [src4q+sstrideq*2]
+    movu         m7, [src4q+sstride3q]
+    add        srcq, sstrideq
+    add       src4q, sstrideq
+    SBUTTERFLY   bw, 0, 1, 12
+    SBUTTERFLY   bw, 2, 3, 12
+    SBUTTERFLY   bw, 4, 5, 12
+    SBUTTERFLY   bw, 6, 7, 12
+    pmaddubsw    m0, m8
+    pmaddubsw    m1, m8
+    pmaddubsw    m2, m9
+    pmaddubsw    m3, m9
+    pmaddubsw    m4, m10
+    pmaddubsw    m5, m10
+    pmaddubsw    m6, m11
+    pmaddubsw    m7, m11
+    paddw        m0, m2
+    paddw        m1, m3
+    paddw        m4, m6
+    paddw        m5, m7
+    paddsw       m0, m4
+    paddsw       m1, m5
+    pmulhrsw     m0, m13
+    pmulhrsw     m1, m13
+    packuswb     m0, m1
+%ifidn %1, avg
+    pavgb        m0, [dstq]
+%endif
+    mova     [dstq], m0
+    add        dstq, dstrideq
+    dec          hd
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter_vx2_fn put
+filter_vx2_fn avg
+
+%endif ; ARCH_X86_64
+
 %macro fpel_fn 6
 %if %2 == 4
 %define %%srcfn movh
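
Note (added aside, not part of the original patch): both new 16px kernels implement VP9's 1-D 8-tap subpel filter. Per output pixel, eight neighbouring samples are weighted by the eight filter taps (pmaddubsw pairs taps 0+1, 2+3, 4+5 and 6+7 against the SBUTTERFLY-interleaved loads), the sum is rounded as (sum + 64) >> 7 (which is what pmulhrsw against pw_256 computes), clipped to 8 bits by packuswb, and for the avg variants rounded-averaged into dst with pavgb. Below is a minimal scalar C sketch of that arithmetic; the names (ref_8tap_1d, clip_uint8, the step parameter) are illustrative, not the ff_*_8tap_1d_* prototypes generated by the mc_func macro above.

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Scalar sketch of the 1-D 8-tap put/avg filter over a w x h block.
     * step is 1 for horizontal filtering and sstride for vertical; src
     * must have 3 valid samples before and 4 after every output pixel
     * along the filtered direction. */
    static void ref_8tap_1d(uint8_t *dst, ptrdiff_t dstride,
                            const uint8_t *src, ptrdiff_t sstride,
                            ptrdiff_t step, int w, int h,
                            const int8_t filter[8], int avg)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += filter[k] * src[x + (k - 3) * step];
                int px = clip_uint8((sum + 64) >> 7);       /* pmulhrsw(pw_256) + packuswb */
                dst[x] = avg ? (dst[x] + px + 1) >> 1 : px; /* pavgb in the avg variant */
            }
            dst += dstride;
            src += sstride;
        }
    }

The horizontal kernel is this loop with w == 16 and step == 1, the eight movu loads covering src[-3]..src[+4] for all 16 outputs at once; the vertical kernel is the same arithmetic with step == sstride, reading its eight rows through srcq and src4q (srcq + 4 rows).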