From f0aca50e0b21d7c97b091f8e551719e0da574e12 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Thu, 22 May 2014 17:48:19 +0000
Subject: x86: hpeldsp: implement SSE2 versions

Those are mostly used in codecs older than H.264, e.g. MPEG-2.

put16 versions:
        mmx  mmx2  sse2
x2:    1888  1185   552
y2:    1778  1092   510

avg16 xy2: 3509 (mmx2) -> 2169 (sse2)

Signed-off-by: Michael Niedermayer
---
 libavcodec/x86/hpeldsp.asm | 115 +++++++++++++++++++++++++++++++++------------
 1 file changed, 85 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 2adead218c..1d26c4516e 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -35,21 +35,39 @@ SECTION_TEXT
 
 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_x2, 4,5,4
+%else
 cglobal put_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 .loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     add          r1, r4
     add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
+    movu         m0, [r1+1]
+    movu         m1, [r1+r2+1]
+%if cpuflag(sse2)
+    movu         m2, [r1]
+    movu         m3, [r1+r2]
+    pavgb        m0, m2
+    pavgb        m1, m3
+%else
+    PAVGB        m0, [r1]
+    PAVGB        m1, [r1+r2]
+%endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m1
@@ -107,6 +125,9 @@ INIT_MMX mmxext
 PUT_PIXELS_16
 INIT_MMX 3dnow
 PUT_PIXELS_16
+; The 8_X2 macro can easily be used here
+INIT_XMM sse2
+PUT_PIXELS8_X2
 
 
 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -199,20 +220,24 @@ PUT_NO_RND_PIXELS8_X2_EXACT
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal put_pixels16_y2, 4,5,3
+%else
 cglobal put_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
     add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     add          r0, r4
     add          r1, r4
     PAVGB        m2, m1
@@ -229,6 +254,9 @@ INIT_MMX mmxext
 PUT_PIXELS8_Y2
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+; actually, put_pixels16_y2_sse2
+INIT_XMM sse2
+PUT_PIXELS8_Y2
 
 
 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -352,34 +380,50 @@ AVG_PIXELS8
 %endmacro
 
 %macro AVG_PIXELS8_X2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_x2, 4,5,4
+%else
 cglobal avg_pixels8_x2, 4,5
+%endif
     lea          r4, [r2*2]
 %if notcpuflag(mmxext)
     pcmpeqd      m5, m5
     paddb        m5, m5
 %endif
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
 %if notcpuflag(mmxext)
     PAVGB_MMX    [r1+1], m0, m3, m5
     PAVGB_MMX [r1+r2+1], m2, m4, m5
     PAVGB_MMX      [r0], m0, m3, m5
     PAVGB_MMX   [r0+r2], m2, m4, m5
+%else
+%if cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
 %else
     PAVGB        m0, [r1+1]
     PAVGB        m2, [r1+r2+1]
+%endif
     PAVGB        m0, [r0]
     PAVGB        m2, [r0+r2]
 %endif
     add          r1, r4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
 %if notcpuflag(mmxext)
     PAVGB_MMX    [r1+1], m0, m3, m5
     PAVGB_MMX [r1+r2+1], m2, m4, m5
+%elif cpuflag(sse2)
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pavgb        m0, m1
+    pavgb        m2, m3
 %else
     PAVGB        m0, [r1+1]
     PAVGB        m2, [r1+r2+1]
@@ -389,6 +433,9 @@ cglobal avg_pixels8_x2, 4,5
 %if notcpuflag(mmxext)
     PAVGB_MMX      [r0], m0, m3, m5
     PAVGB_MMX   [r0+r2], m2, m4, m5
+%elif cpuflag(sse2)
+    pavgb        m0, [r0]
+    pavgb        m2, [r0+r2]
 %else
     PAVGB        m0, [r0]
     PAVGB        m2, [r0+r2]
@@ -407,36 +454,39 @@ INIT_MMX mmxext
 AVG_PIXELS8_X2
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+; actually avg_pixels16_x2
+INIT_XMM sse2
+AVG_PIXELS8_X2
 
 
 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_Y2 0
+%if cpuflag(sse2)
+cglobal avg_pixels16_y2, 4,5,3
+%else
 cglobal avg_pixels8_y2, 4,5
+%endif
     lea          r4, [r2*2]
-    mova         m0, [r1]
+    movu         m0, [r1]
     sub          r0, r2
 .loop:
-    mova         m1, [r1+r2]
-    mova         m2, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m2, [r1+r4]
    add          r1, r4
     PAVGB        m0, m1
     PAVGB        m1, m2
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m0, m3
-    PAVGB        m1, m4
+    PAVGB        m0, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m0
     mova    [r0+r4], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
+    movu         m1, [r1+r2]
+    movu         m0, [r1+r4]
     PAVGB        m2, m1
     PAVGB        m1, m0
     add          r0, r4
     add          r1, r4
-    mova         m3, [r0+r2]
-    mova         m4, [r0+r4]
-    PAVGB        m2, m3
-    PAVGB        m1, m4
+    PAVGB        m2, [r0+r2]
+    PAVGB        m1, [r0+r4]
     mova    [r0+r2], m2
     mova    [r0+r4], m1
     add          r0, r4
@@ -449,6 +499,9 @@ INIT_MMX mmxext
 AVG_PIXELS8_Y2
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+; actually avg_pixels16_y2
+INIT_XMM sse2
+AVG_PIXELS8_Y2
 
 
 ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -571,3 +624,5 @@ INIT_MMX mmxext
 AVG_PIXELS_XY2
 INIT_MMX 3dnow
 AVG_PIXELS_XY2
+INIT_XMM sse2
+AVG_PIXELS_XY2
--
cgit v1.2.3
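
For reference, a scalar sketch in C of what the x2 (horizontal halfpel) and
y2 (vertical halfpel) put kernels above compute; the function names and this
standalone form are illustrative only, not part of the patch or of FFmpeg's
API. pavgb is a byte-wise rounded average, (a + b + 1) >> 1, which these loops
spell out; the avg variants in the patch additionally average the result with
the bytes already stored in block.

    #include <stddef.h>
    #include <stdint.h>

    /* Rounded byte average: the scalar equivalent of SSE2 pavgb. */
    static inline uint8_t rnd_avg(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }

    /* Horizontal halfpel: average each byte with its right neighbour. */
    void put_pixels16_x2_ref(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 16; j++)
                block[j] = rnd_avg(pixels[j], pixels[j + 1]);
            block  += line_size;  /* next destination row */
            pixels += line_size;  /* next source row */
        }
    }

    /* Vertical halfpel: average each byte with the byte one row below. */
    void put_pixels16_y2_ref(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
    {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 16; j++)
                block[j] = rnd_avg(pixels[j], pixels[j + line_size]);
            block  += line_size;
            pixels += line_size;
        }
    }

The source rows are not 16-byte aligned (pixels may sit at a halfpel offset),
which is why the patch loads them with movu while the aligned destination
still uses mova.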