From 45c7f3997ea11c3d1007b2126b1c0049a8c27105 Mon Sep 17 00:00:00 2001
From: Clément Bœsch
Date: Thu, 14 Aug 2014 22:30:55 +0200
Subject: avutil/pixelutils: faster pixelutils_sad_[au]_16x16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

~560 → ~500 decicycles

This is following the comments from Michael in
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html

Using 2 registers for accumulator didn't help. On the other hand, some
re-ordering between the movs and psadbw allowed going from ~538 to ~500.
---
 libavutil/x86/pixelutils.asm | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'libavutil/x86/pixelutils.asm')

diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 8ab0a18355..15213d92d8 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
 %macro SAD_XMM_16x16 1
 INIT_XMM sse2
 cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
-    pxor        m2, m2
-%rep 8
-    mov%1       m0, [src2q]
+    mov%1       m2, [src2q]
+    psadbw      m2, [src1q]
     mov%1       m1, [src2q + stride2q]
-    psadbw      m0, [src1q]
     psadbw      m1, [src1q + stride1q]
-    paddw       m2, m0
     paddw       m2, m1
+%rep 7
     lea         src1q, [src1q + 2*stride1q]
     lea         src2q, [src2q + 2*stride2q]
+    mov%1       m0, [src2q]
+    psadbw      m0, [src1q]
+    mov%1       m1, [src2q + stride2q]
+    psadbw      m1, [src1q + stride1q]
+    paddw       m2, m0
+    paddw       m2, m1
 %endrep
     movhlps     m0, m2
     paddw       m2, m0
--
cgit v1.2.3