From b829b4ce29185625ab8cbcf0ce7a83cf8181ac3b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 28 Jul 2012 10:11:00 -0700 Subject: h264: convert loop filter strength dsp function to yasm. This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182->172) on x86-64, and ~8 cycles faster (201->193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) (limited to 'libavcodec/x86/h264_deblock.asm') diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 0891ef33da..940a8f77e1 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -27,6 +27,10 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA + +pb_3_1: times 4 db 3, 1 + SECTION .text cextern pb_0 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmx2: paddb m1, m5 paddb m2, m6 ret + +;----------------------------------------------------------------------------- +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], +; int8_t ref[2][40], int16_t mv[2][40][2], +; int bidir, int edges, int step, +; int mask_mv0, int mask_mv1, int field); +; +; bidir is 0 or 1 +; edges is 1 or 4 +; step is 1 or 2 +; mask_mv0 is 0 or 3 +; mask_mv1 is 0 or 1 +; field is 0 or 1 +;----------------------------------------------------------------------------- +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, + ; dir, d_idx, mask_dir, bidir +%define edgesd %1 +%define stepd %2 +%define mask_mvd %3 +%define dir %4 +%define d_idx %5 +%define mask_dir %6 +%define bidir %7 + xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step) +%%.b_idx_loop: +%if mask_dir == 0 + pxor m0, m0 +%endif + test b_idxd, dword mask_mvd + jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv)) +%if bidir == 1 + movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } + punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } + pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } + pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } + pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } + psubb m0, m2 ; { ref0[b] != ref0[bn], + ; ref0[b] != ref1[bn] } + psubb m1, m3 ; { ref1[b] != ref1[bn], + ; ref1[b] != ref0[bn] } + + por m0, m1 + mova m1, [mvq+b_idxq*4+(d_idx+12)*4] + mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] + mova m3, m1 + mova m4, m2 + psubw m1, [mvq+b_idxq*4+12*4] + psubw m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m3, [mvq+b_idxq*4+52*4] + psubw m4, [mvq+b_idxq*4+52*4+mmsize] + packsswb m1, m2 + packsswb m3, m4 + paddb m1, m6 + paddb m3, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + psubusb m3, m5 + packsswb m1, m3 + + por m0, m1 + mova m1, [mvq+b_idxq*4+(d_idx+52)*4] + mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] + mova m3, m1 + mova m4, m2 + psubw m1, [mvq+b_idxq*4+12*4] + psubw m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m3, [mvq+b_idxq*4+52*4] + psubw m4, [mvq+b_idxq*4+52*4+mmsize] + packsswb m1, m2 + packsswb m3, m4 + paddb m1, m6 + paddb m3, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + psubusb m3, m5 + packsswb m1, m3 + + pshufw m1, m1, 0x4E + por m0, m1 + pshufw m1, m0, 0x4E + pminub m0, m1 +%else ; bidir == 0 + movd m0, [refq+b_idxq+12] + psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] + + mova m1, [mvq+b_idxq*4+12*4] + mova m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m1, [mvq+b_idxq*4+(d_idx+12)*4] + psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] + packsswb m1, m2 + paddb m1, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + packsswb m1, m1 + por m0, m1 +%endif ; bidir == 1/0 + +%%.skip_loop_iter: + movd m1, [nnzq+b_idxq+12] + por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] + + pminub m1, m7 + pminub m0, m7 + psllw m1, 1 + pxor m2, m2 + pmaxub m1, m0 + punpcklbw m1, m2 + movq [bsq+b_idxq+32*dir], m1 + + add b_idxd, dword stepd + cmp b_idxd, dword edgesd + jl %%.b_idx_loop +%endmacro + +INIT_MMX mmx2 +cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \ + step, mask_mv0, mask_mv1, field +%define b_idxq bidirq +%define b_idxd bidird + cmp dword fieldm, 0 + mova m7, [pb_1] + mova m5, [pb_3] + je .nofield + mova m5, [pb_3_1] +.nofield: + mova m6, m5 + paddb m5, m5 + + shl dword stepd, 3 + shl dword edgesd, 3 +%if ARCH_X86_32 +%define mask_mv0d mask_mv0m +%define mask_mv1d mask_mv1m +%endif + shl dword mask_mv1d, 3 + shl dword mask_mv0d, 3 + + cmp dword bidird, 0 + jne .bidir + loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0 + loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0 + + mova m0, [bsq+mmsize*0] + mova m1, [bsq+mmsize*1] + mova m2, [bsq+mmsize*2] + mova m3, [bsq+mmsize*3] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + mova [bsq+mmsize*0], m0 + mova [bsq+mmsize*1], m1 + mova [bsq+mmsize*2], m2 + mova [bsq+mmsize*3], m3 + RET + +.bidir: + loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1 + loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1 + + mova m0, [bsq+mmsize*0] + mova m1, [bsq+mmsize*1] + mova m2, [bsq+mmsize*2] + mova m3, [bsq+mmsize*3] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + mova [bsq+mmsize*0], m0 + mova [bsq+mmsize*1], m1 + mova [bsq+mmsize*2], m2 + mova [bsq+mmsize*3], m3 + RET -- cgit v1.2.3