summaryrefslogtreecommitdiff
path: root/libavcodec/x86/vp9dsp.asm
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2013-09-21 22:03:00 -0400
committerRonald S. Bultje <rsbultje@gmail.com>2013-10-02 21:03:15 -0400
commitf1548c008fc5d50488bb7bbc2aa4cf49d89a0bda (patch)
tree7c11cf685ee12c80eaba56dc1deb61003833ddac /libavcodec/x86/vp9dsp.asm
parentc07ac8d467f0682d46be5d76337e2d86de21a0c2 (diff)
Full-pixel MC functions.
Decoding time of ped1080p.webm goes from 11.3sec to 11.1sec.
Diffstat (limited to 'libavcodec/x86/vp9dsp.asm')
-rw-r--r--libavcodec/x86/vp9dsp.asm57
1 files changed, 57 insertions, 0 deletions
diff --git a/libavcodec/x86/vp9dsp.asm b/libavcodec/x86/vp9dsp.asm
index 30db0ca3af..740d67cfa1 100644
--- a/libavcodec/x86/vp9dsp.asm
+++ b/libavcodec/x86/vp9dsp.asm
@@ -219,3 +219,60 @@ filter_v_fn avg
INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
+
+%macro fpel_fn 6
+%if %2 == 4
+%define %%srcfn movh
+%define %%dstfn movh
+%else
+%define %%srcfn movu
+%define %%dstfn mova
+%endif
+
+%if %2 <= 16
+cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+ lea sstride3q, [sstrideq*3]
+ lea dstride3q, [dstrideq*3]
+%else
+cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h
+%endif
+.loop:
+ %%srcfn m0, [srcq]
+ %%srcfn m1, [srcq+s%3]
+ %%srcfn m2, [srcq+s%4]
+ %%srcfn m3, [srcq+s%5]
+ lea srcq, [srcq+sstrideq*%6]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+d%3]
+ pavgb m2, [dstq+d%4]
+ pavgb m3, [dstq+d%5]
+%endif
+ %%dstfn [dstq], m0
+ %%dstfn [dstq+d%3], m1
+ %%dstfn [dstq+d%4], m2
+ %%dstfn [dstq+d%5], m3
+ lea dstq, [dstq+dstrideq*%6]
+ sub hd, %6
+ jnz .loop
+ RET
+%endmacro
+
+%define d16 16
+%define s16 16
+INIT_MMX mmx
+fpel_fn put, 4, strideq, strideq*2, stride3q, 4
+fpel_fn put, 8, strideq, strideq*2, stride3q, 4
+INIT_MMX sse
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+INIT_XMM sse
+fpel_fn put, 16, strideq, strideq*2, stride3q, 4
+fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+%undef s16
+%undef d16