From e25dee602ff3be582e3a6092c65d08cd6310a103 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser
Date: Mon, 26 Jul 2010 19:34:00 +0000
Subject: VP8: Much faster SSE2 MC

5-10% faster or more on Phenom, Athlon 64, and some others.
Helps some on pre-SSSE3 Intel chips as well, but not as much.

Originally committed as revision 24513 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/x86/vp8dsp.asm | 166 ++++++++++++++++++++++------------------------
 1 file changed, 78 insertions(+), 88 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 1214438395..166cd3b153 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -438,48 +438,43 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
     jg .nextrow
     REP_RET

-; 4x4 block, H-only 4-tap filter
 INIT_XMM
-cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
-    shl      r5d, 4
+cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
+    shl      r5d, 5
 %ifdef PIC
-    lea      r11, [fourtap_filter_hw_m]
+    lea      r11, [fourtap_filter_v_m]
 %endif
-    mova      m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
-    mova      m6, [fourtap_filter_hw+r5]
+    lea       r5, [fourtap_filter_v+r5-32]
     pxor      m7, m7
-
+    mova      m4, [pw_64]
+    mova      m5, [r5+ 0]
+    mova      m6, [r5+16]
+%ifdef m8
+    mova      m8, [r5+32]
+    mova      m9, [r5+48]
+%endif
 .nextrow
-    movh      m0, [r2-1]
-    punpcklbw m0, m7        ; ABCDEFGH
-    mova      m1, m0
-    mova      m2, m0
-    mova      m3, m0
-    psrldq    m1, 2         ; BCDEFGH
-    psrldq    m2, 4         ; CDEFGH
-    psrldq    m3, 6         ; DEFGH
-    punpcklwd m0, m1        ; ABBCCDDE
-    punpcklwd m2, m3        ; CDDEEFFG
-    pmaddwd   m0, m5
-    pmaddwd   m2, m6
-    paddd     m0, m2
-
-    movh      m1, [r2+3]
-    punpcklbw m1, m7        ; ABCDEFGH
-    mova      m2, m1
-    mova      m3, m1
-    mova      m4, m1
-    psrldq    m2, 2         ; BCDEFGH
-    psrldq    m3, 4         ; CDEFGH
-    psrldq    m4, 6         ; DEFGH
-    punpcklwd m1, m2        ; ABBCCDDE
-    punpcklwd m3, m4        ; CDDEEFFG
-    pmaddwd   m1, m5
-    pmaddwd   m3, m6
-    paddd     m1, m3
-
-    packssdw  m0, m1
-    paddsw    m0, [pw_64]
+    movq      m0, [r2-1]
+    movq      m1, [r2-0]
+    movq      m2, [r2+1]
+    movq      m3, [r2+2]
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    pmullw    m0, m5
+    pmullw    m1, m6
+%ifdef m8
+    pmullw    m2, m8
+    pmullw    m3, m9
+%else
+    pmullw    m2, [r5+32]
+    pmullw    m3, [r5+48]
+%endif
+    paddsw    m0, m1
+    paddsw    m2, m3
+    paddsw    m0, m2
+    paddsw    m0, m4
     psraw     m0, 7
     packuswb  m0, m7
     movh    [r0], m0        ; store
@@ -491,62 +486,57 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
     jg .nextrow
     REP_RET

-cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
+cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
     lea      r5d, [r5*3]
+    shl      r5d, 4
 %ifdef PIC
-    lea      r11, [sixtap_filter_hw_m]
+    lea      r11, [sixtap_filter_v_m]
 %endif
-    lea       r5, [sixtap_filter_hw+r5*8]
+    lea       r5, [sixtap_filter_v+r5-96]
     pxor      m7, m7
-
+    mova      m6, [pw_64]
+%ifdef m8
+    mova      m8, [r5+ 0]
+    mova      m9, [r5+16]
+    mova     m10, [r5+32]
+    mova     m11, [r5+48]
+    mova     m12, [r5+64]
+    mova     m13, [r5+80]
+%endif
 .nextrow
-    movu      m0, [r2-2]
-    mova      m6, m0
-    mova      m4, m0
-    punpcklbw m0, m7        ; ABCDEFGHI
-    mova      m1, m0
-    mova      m2, m0
-    mova      m3, m0
-    psrldq    m1, 2         ; BCDEFGH
-    psrldq    m2, 4         ; CDEFGH
-    psrldq    m3, 6         ; DEFGH
-    psrldq    m4, 4
-    punpcklbw m4, m7        ; EFGH
-    mova      m5, m4
-    psrldq    m5, 2         ; FGH
-    punpcklwd m0, m1        ; ABBCCDDE
-    punpcklwd m2, m3        ; CDDEEFFG
-    punpcklwd m4, m5        ; EFFGGHHI
-    pmaddwd   m0, [r5-48]
-    pmaddwd   m2, [r5-32]
-    pmaddwd   m4, [r5-16]
-    paddd     m0, m2
-    paddd     m0, m4
-
-    psrldq    m6, 4
-    mova      m4, m6
-    punpcklbw m6, m7        ; ABCDEFGHI
-    mova      m1, m6
-    mova      m2, m6
-    mova      m3, m6
-    psrldq    m1, 2         ; BCDEFGH
-    psrldq    m2, 4         ; CDEFGH
-    psrldq    m3, 6         ; DEFGH
-    psrldq    m4, 4
-    punpcklbw m4, m7        ; EFGH
-    mova      m5, m4
-    psrldq    m5, 2         ; FGH
-    punpcklwd m6, m1        ; ABBCCDDE
-    punpcklwd m2, m3        ; CDDEEFFG
-    punpcklwd m4, m5        ; EFFGGHHI
-    pmaddwd   m6, [r5-48]
-    pmaddwd   m2, [r5-32]
-    pmaddwd   m4, [r5-16]
-    paddd     m6, m2
-    paddd     m6, m4
-
-    packssdw  m0, m6
-    paddsw    m0, [pw_64]
+    movq      m0, [r2-2]
+    movq      m1, [r2-1]
+    movq      m2, [r2-0]
+    movq      m3, [r2+1]
+    movq      m4, [r2+2]
+    movq      m5, [r2+3]
+    punpcklbw m0, m7
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    punpcklbw m5, m7
+%ifdef m8
+    pmullw    m0, m8
+    pmullw    m1, m9
+    pmullw    m2, m10
+    pmullw    m3, m11
+    pmullw    m4, m12
+    pmullw    m5, m13
+%else
+    pmullw    m0, [r5+ 0]
+    pmullw    m1, [r5+16]
+    pmullw    m2, [r5+32]
+    pmullw    m3, [r5+48]
+    pmullw    m4, [r5+64]
+    pmullw    m5, [r5+80]
+%endif
+    paddsw    m1, m4
+    paddsw    m0, m5
+    paddsw    m1, m2
+    paddsw    m0, m3
+    paddsw    m0, m1
+    paddsw    m0, m6
     psraw     m0, 7
     packuswb  m0, m7
     movh    [r0], m0        ; store
--
cgit v1.2.3
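Note appended for context (not part of the committed patch): the rewritten kernels compute the same VP8 subpel filter as before; what changes is the SIMD organisation. Instead of building word pairs with psrldq/punpcklwd and feeding pmaddwd two taps at a time, the new code does one unaligned movq load per tap offset and one pmullw against a filter register holding that tap in every lane (the fourtap_filter_v / sixtap_filter_v tables), keeping the taps resident in m8-m13 where 16 XMM registers are available. Below is a minimal scalar C sketch of the arithmetic the 4-tap horizontal case implements; the function and parameter names are illustrative only, not FFmpeg API, and the real routine processes a fixed 8-pixel-wide block per row.

/* Scalar model of the 4-tap horizontal filter the SSE2 kernel implements:
 * 4 taps over src[x-1]..src[x+2] (the movq [r2-1]..[r2+2] loads),
 * +64 bias (pw_64), >>7 (psraw 7), clip to 8 bits (packuswb).
 * Names are hypothetical, not the FFmpeg API. */
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void epel_h4_scalar(uint8_t *dst, int dststride,
                           const uint8_t *src, int srcstride,
                           int width, int height, const int16_t filter[4])
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int sum = filter[0] * src[x - 1] +
                      filter[1] * src[x    ] +
                      filter[2] * src[x + 1] +
                      filter[3] * src[x + 2];
            dst[x] = clip_uint8((sum + 64) >> 7);
        }
        dst += dststride;
        src += srcstride;
    }
}

The 6-tap case follows the same pattern with six taps over src[x-2]..src[x+3] ([r2-2]..[r2+3] in the asm). The asm performs the additions with saturating word adds (paddsw) in a specific order, whereas the sketch uses plain 32-bit arithmetic.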