From 45c7f3997ea11c3d1007b2126b1c0049a8c27105 Mon Sep 17 00:00:00 2001
From: Clément Bœsch <u@pkh.me>
Date: Thu, 14 Aug 2014 22:30:55 +0200
Subject: avutil/pixelutils: faster pixelutils_sad_[au]_16x16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

~560 → ~500 decicycles

This follows the comments from Michael in
https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html

Using 2 registers for the accumulator didn't help. On the other hand,
some re-ordering between the movs and psadbw allowed going from ~538 to
~500 decicycles.
---
 libavutil/x86/pixelutils.asm | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'libavutil/x86/pixelutils.asm')

diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 8ab0a18355..15213d92d8 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
 %macro SAD_XMM_16x16 1
 INIT_XMM sse2
 cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
-    pxor        m2, m2
-%rep 8
-    mov%1       m0, [src2q]
+    mov%1       m2, [src2q]
+    psadbw      m2, [src1q]
     mov%1       m1, [src2q + stride2q]
-    psadbw      m0, [src1q]
     psadbw      m1, [src1q + stride1q]
-    paddw       m2, m0
     paddw       m2, m1
+%rep 7
     lea         src1q, [src1q + 2*stride1q]
     lea         src2q, [src2q + 2*stride2q]
+    mov%1       m0, [src2q]
+    psadbw      m0, [src1q]
+    mov%1       m1, [src2q + stride2q]
+    psadbw      m1, [src1q + stride1q]
+    paddw       m2, m0
+    paddw       m2, m1
 %endrep
     movhlps     m0, m2
     paddw       m2, m0
-- 
cgit v1.2.3