From f9888520cc853fe27cbe6f3827b2723c63299a4f Mon Sep 17 00:00:00 2001
From: Christophe GISQUET
Date: Mon, 26 Mar 2012 02:29:09 +0200
Subject: vp8dsp x86: perform rounding shift with a single instruction

Signed-off-by: Ronald S. Bultje
---
 libavcodec/x86/vp8dsp.asm | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'libavcodec/x86/vp8dsp.asm')

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4cfeb3e330..82f21fefae 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -144,6 +144,8 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
 
+pw_256: times 8 dw 256
+
 pw_20091: times 4 dw 20091
 pw_17734: times 4 dw 17734
 
@@ -205,8 +207,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
     pmaddubsw  m2, m7
     paddsw     m0, m1
     paddsw     m0, m2
-    paddsw     m0, [pw_64]
-    psraw      m0, 7
+    pmulhrsw   m0, [pw_256]
     packuswb   m0, m0
     movh       [dstq], m0 ; store
 
@@ -219,7 +220,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
 
 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
     shl        mxd, 4
-    mova       m2, [pw_64]
+    mova       m2, [pw_256]
     mova       m3, [filter_h2_shuf]
     mova       m4, [filter_h4_shuf]
 %ifdef PIC
@@ -235,9 +236,8 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
     pshufb     m1, m4
     pmaddubsw  m0, m5
     pmaddubsw  m1, m6
-    paddsw     m0, m2
     paddsw     m0, m1
-    psraw      m0, 7
+    pmulhrsw   m0, m2
     packuswb   m0, m0
     movh       [dstq], m0 ; store
 
@@ -255,7 +255,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
 %endif
     mova       m5, [fourtap_filter_hb+myq-16]
     mova       m6, [fourtap_filter_hb+myq]
-    mova       m7, [pw_64]
+    mova       m7, [pw_256]
 
     ; read 3 lines
     sub        srcq, srcstrideq
@@ -275,8 +275,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     pmaddubsw  m2, m6
     paddsw     m4, m2
     mova       m2, m3
-    paddsw     m4, m7
-    psraw      m4, 7
+    pmulhrsw   m4, m7
     packuswb   m4, m4
     movh       [dstq], m4
 
@@ -319,9 +318,8 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     paddsw     m6, m1
     paddsw     m6, m7
     mova       m1, m2
-    paddsw     m6, [pw_64]
     mova       m2, m3
-    psraw      m6, 7
+    pmulhrsw   m6, [pw_256]
     mova       m3, m4
     packuswb   m6, m6
     mova       m4, m5
--
cgit v1.2.3