summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Clément Bœsch <u@pkh.me> 2014-08-14 22:30:55 +0200
committer: Clément Bœsch <u@pkh.me> 2014-08-23 10:18:53 +0200
commit: 45c7f3997ea11c3d1007b2126b1c0049a8c27105 (patch)
tree: 4b197592039c1b5cfd458db0e12b604f94e0dca4
parent: c82a288f8747a92278ba2e1a8c30380c18254bbd (diff)
avutil/pixelutils: faster pixelutils_sad_[au]_16x16
~560 → ~500 decicycles. This follows the comments from Michael in <https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html>. Using 2 registers for the accumulator didn't help. On the other hand, some reordering between the movs and psadbw allowed going from ~538 to ~500.
-rw-r--r-- libavutil/x86/pixelutils.asm 14
1 files changed, 9 insertions, 5 deletions
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 8ab0a18355..15213d92d8 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
- pxor m2, m2
-%rep 8
- mov%1 m0, [src2q]
+ mov%1 m2, [src2q]
+ psadbw m2, [src1q]
mov%1 m1, [src2q + stride2q]
- psadbw m0, [src1q]
psadbw m1, [src1q + stride1q]
- paddw m2, m0
paddw m2, m1
+%rep 7
lea src1q, [src1q + 2*stride1q]
lea src2q, [src2q + 2*stride2q]
+ mov%1 m0, [src2q]
+ psadbw m0, [src1q]
+ mov%1 m1, [src2q + stride2q]
+ psadbw m1, [src1q + stride1q]
+ paddw m2, m0
+ paddw m2, m1
%endrep
movhlps m0, m2
paddw m2, m0