From b829b4ce29185625ab8cbcf0ce7a83cf8181ac3b Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 28 Jul 2012 10:11:00 -0700 Subject: h264: convert loop filter strength dsp function to yasm. This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182->172) on x86-64, and ~8 cycles faster (201->193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 ++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 162 ++------------------------------------ 2 files changed, 175 insertions(+), 155 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 0891ef33da..940a8f77e1 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -27,6 +27,10 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA + +pb_3_1: times 4 db 3, 1 + SECTION .text cextern pb_0 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmx2: paddb m1, m5 paddb m2, m6 ret + +;----------------------------------------------------------------------------- +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], +; int8_t ref[2][40], int16_t mv[2][40][2], +; int bidir, int edges, int step, +; int mask_mv0, int mask_mv1, int field); +; +; bidir is 0 or 1 +; edges is 1 or 4 +; step is 1 or 2 +; mask_mv0 is 0 or 3 +; mask_mv1 is 0 or 1 +; field is 0 or 1 +;----------------------------------------------------------------------------- +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, + ; dir, d_idx, mask_dir, bidir +%define edgesd %1 +%define stepd %2 +%define mask_mvd %3 +%define dir %4 +%define d_idx %5 +%define mask_dir %6 +%define bidir %7 + xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step) +%%.b_idx_loop: +%if mask_dir == 0 + pxor m0, m0 +%endif + test b_idxd, dword mask_mvd + jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv)) +%if 
bidir == 1 + movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } + punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } + pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } + pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } + pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } + psubb m0, m2 ; { ref0[b] != ref0[bn], + ; ref0[b] != ref1[bn] } + psubb m1, m3 ; { ref1[b] != ref1[bn], + ; ref1[b] != ref0[bn] } + + por m0, m1 + mova m1, [mvq+b_idxq*4+(d_idx+12)*4] + mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] + mova m3, m1 + mova m4, m2 + psubw m1, [mvq+b_idxq*4+12*4] + psubw m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m3, [mvq+b_idxq*4+52*4] + psubw m4, [mvq+b_idxq*4+52*4+mmsize] + packsswb m1, m2 + packsswb m3, m4 + paddb m1, m6 + paddb m3, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + psubusb m3, m5 + packsswb m1, m3 + + por m0, m1 + mova m1, [mvq+b_idxq*4+(d_idx+52)*4] + mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] + mova m3, m1 + mova m4, m2 + psubw m1, [mvq+b_idxq*4+12*4] + psubw m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m3, [mvq+b_idxq*4+52*4] + psubw m4, [mvq+b_idxq*4+52*4+mmsize] + packsswb m1, m2 + packsswb m3, m4 + paddb m1, m6 + paddb m3, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + psubusb m3, m5 + packsswb m1, m3 + + pshufw m1, m1, 0x4E + por m0, m1 + pshufw m1, m0, 0x4E + pminub m0, m1 +%else ; bidir == 0 + movd m0, [refq+b_idxq+12] + psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] + + mova m1, [mvq+b_idxq*4+12*4] + mova m2, [mvq+b_idxq*4+12*4+mmsize] + psubw m1, [mvq+b_idxq*4+(d_idx+12)*4] + psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] + packsswb m1, m2 + paddb m1, m6 + psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit + packsswb m1, m1 + por m0, m1 +%endif ; bidir == 1/0 + +%%.skip_loop_iter: + movd m1, [nnzq+b_idxq+12] + por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] + + pminub m1, m7 + pminub m0, m7 + psllw m1, 1 + pxor m2, m2 + pmaxub m1, m0 + punpcklbw m1, m2 + movq [bsq+b_idxq+32*dir], m1 + + add b_idxd, dword stepd + cmp 
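b_idxd, dword edgesd + jl %%.b_idx_loop +%endmacro + +INIT_MMX mmx2 +cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \ + step, mask_mv0, mask_mv1, field +%define b_idxq bidirq +%define b_idxd bidird + cmp dword fieldm, 0 + mova m7, [pb_1] + mova m5, [pb_3] + je .nofield + mova m5, [pb_3_1] +.nofield: + mova m6, m5 + paddb m5, m5 + + shl dword stepd, 3 + shl dword edgesd, 3 +%if ARCH_X86_32 +%define mask_mv0d mask_mv0m +%define mask_mv1d mask_mv1m +%endif + shl dword mask_mv1d, 3 + shl dword mask_mv0d, 3 + + cmp dword bidird, 0 + jne .bidir + loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0 + loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0 + + mova m0, [bsq+mmsize*0] + mova m1, [bsq+mmsize*1] + mova m2, [bsq+mmsize*2] + mova m3, [bsq+mmsize*3] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + mova [bsq+mmsize*0], m0 + mova [bsq+mmsize*1], m1 + mova [bsq+mmsize*2], m2 + mova [bsq+mmsize*3], m3 + RET + +.bidir: + loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1 + loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1 + + mova m0, [bsq+mmsize*0] + mova m1, [bsq+mmsize*1] + mova m2, [bsq+mmsize*2] + mova m3, [bsq+mmsize*3] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + mova [bsq+mmsize*0], m0 + mova [bsq+mmsize*1], m1 + mova [bsq+mmsize*2], m2 + mova [bsq+mmsize*3], m3 + RET diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index 3f18f64f4b..5d9da993a6 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -88,158 +88,10 @@ void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul /***********************************/ /* deblocking */ -#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \ - do { \ - x86_reg b_idx; \ - mask_mv <<= 3; \ - for( b_idx=0; b_idx<edges; b_idx+=step ) { \ - if (!mask_dir) \ - __asm__ volatile( \ - "pxor %%mm0, %%mm0 \n\t" \ - :: \ - ); \ - if(!(mask_mv & b_idx)) { \ - if(bidir) { \ - __asm__ volatile( \ - "movd %a3(%0,%2), %%mm2 \n" \ - "punpckldq %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \ - "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \ - "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \ - "pshufw $0x4E, %%mm2, %%mm3 \n" \ - "psubb %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \ - "psubb %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \ - \ - "por %%mm1, %%mm0 \n" \ - "movq %a5(%1,%2,4), %%mm1 \n" \ - "movq %a6(%1,%2,4), %%mm2 \n" \ - "movq %%mm1, %%mm3 \n" \ - "movq %%mm2, %%mm4 \n" \ - "psubw 48(%1,%2,4), %%mm1 \n" \ - "psubw 56(%1,%2,4), %%mm2 \n" \ - "psubw 208(%1,%2,4), %%mm3 \n" \ - "psubw 216(%1,%2,4), %%mm4 \n" \ - "packsswb %%mm2, %%mm1 \n" \ - "packsswb %%mm4, %%mm3 \n" \ - "paddb %%mm6, %%mm1 \n" \ - "paddb %%mm6, %%mm3 \n" \ - "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \ - "psubusb %%mm5, %%mm3 \n" \ - "packsswb %%mm3, %%mm1 \n" \ - \ - "por %%mm1, %%mm0 \n" \ - "movq %a7(%1,%2,4),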
%%mm1 \n" \ - "movq %a8(%1,%2,4), %%mm2 \n" \ - "movq %%mm1, %%mm3 \n" \ - "movq %%mm2, %%mm4 \n" \ - "psubw 48(%1,%2,4), %%mm1 \n" \ - "psubw 56(%1,%2,4), %%mm2 \n" \ - "psubw 208(%1,%2,4), %%mm3 \n" \ - "psubw 216(%1,%2,4), %%mm4 \n" \ - "packsswb %%mm2, %%mm1 \n" \ - "packsswb %%mm4, %%mm3 \n" \ - "paddb %%mm6, %%mm1 \n" \ - "paddb %%mm6, %%mm3 \n" \ - "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \ - "psubusb %%mm5, %%mm3 \n" \ - "packsswb %%mm3, %%mm1 \n" \ - \ - "pshufw $0x4E, %%mm1, %%mm1 \n" \ - "por %%mm1, %%mm0 \n" \ - "pshufw $0x4E, %%mm0, %%mm1 \n" \ - "pminub %%mm1, %%mm0 \n" \ - ::"r"(ref), \ - "r"(mv), \ - "r"(b_idx), \ - "i"(d_idx+12), \ - "i"(d_idx+52), \ - "i"(d_idx*4+48), \ - "i"(d_idx*4+56), \ - "i"(d_idx*4+208), \ - "i"(d_idx*4+216) \ - ); \ - } else { \ - __asm__ volatile( \ - "movd 12(%0,%2), %%mm0 \n" \ - "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \ - "movq 48(%1,%2,4), %%mm1 \n" \ - "movq 56(%1,%2,4), %%mm2 \n" \ - "psubw %a4(%1,%2,4), %%mm1 \n" \ - "psubw %a5(%1,%2,4), %%mm2 \n" \ - "packsswb %%mm2, %%mm1 \n" \ - "paddb %%mm6, %%mm1 \n" \ - "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \ - "packsswb %%mm1, %%mm1 \n" \ - "por %%mm1, %%mm0 \n" \ - ::"r"(ref), \ - "r"(mv), \ - "r"(b_idx), \ - "i"(d_idx+12), \ - "i"(d_idx*4+48), \ - "i"(d_idx*4+56) \ - ); \ - } \ - } \ - __asm__ volatile( \ - "movd 12(%0,%1), %%mm1 \n" \ - "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \ - ::"r"(nnz), \ - "r"(b_idx), \ - "i"(d_idx+12) \ - ); \ - __asm__ volatile( \ - "pminub %%mm7, %%mm1 \n" \ - "pminub %%mm7, %%mm0 \n" \ - "psllw $1, %%mm1 \n" \ - "pxor %%mm2, %%mm2 \n" \ - "pmaxub %%mm0, %%mm1 \n" \ - "punpcklbw %%mm2, %%mm1 \n" \ - "movq %%mm1, %a1(%0,%2) \n" \ - ::"r"(bS), \ - "i"(32*dir), \ - "r"(b_idx) \ - :"memory" \ - ); \ - } \ - } while (0) - -static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, int 
mask_mv0, int mask_mv1, int field ) { - __asm__ volatile( - "movq %0, %%mm7 \n" - "movq %1, %%mm6 \n" - ::"m"(ff_pb_1), "m"(ff_pb_3) - ); - if(field) - __asm__ volatile( - "movq %0, %%mm6 \n" - ::"m"(ff_pb_3_1) - ); - __asm__ volatile( - "movq %%mm6, %%mm5 \n" - "paddb %%mm5, %%mm5 \n" - :); - - // could do a special case for dir==0 && edges==1, but it only reduces the - // average filter time by 1.2% - step <<= 3; - edges <<= 3; - h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv1, 1, -8, 0); - h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, 32, 8, mask_mv0, 0, -1, -1); - - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) - "movq %%mm0, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, 16(%0) \n\t" - "movq %%mm2, 24(%0) \n\t" - ::"r"(bS[0]) - :"memory" - ); -} +void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], + int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, + int mask_mv0, int mask_mv1, int field); #define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \ @@ -344,12 +196,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom { int mm_flags = av_get_cpu_flags(); +#if HAVE_YASM if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) { - c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; + c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2; } if (bit_depth == 8) { -#if HAVE_YASM if (mm_flags & AV_CPU_FLAG_MMX) { c->h264_idct_dc_add = c->h264_idct_add = ff_h264_idct_add_8_mmx; @@ -510,6 +362,6 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom #endif /* HAVE_AVX */ } } -#endif } +#endif } -- cgit v1.2.3