path: root/libavcodec/x86/hpeldsp.asm
author    Christophe Gisquet <christophe.gisquet@gmail.com>    2014-05-22 23:47:06 +0200
committer Michael Niedermayer <michaelni@gmx.at>               2014-05-24 15:15:56 +0200
commit    81aa0f4604f98da692f2689c84968f90354a92ea (patch)
tree      243197f645e40a7dd5219df7992e4f8452fc73a9 /libavcodec/x86/hpeldsp.asm
parent    726316240bcc41cef6053dd6d1e46a3c57328498 (diff)
x86: hpeldsp: implement SSSE3 version of _xy2
Loading pb_1 into a register rather than pw_8192 was benchmarked to be more
efficient: loading both constants yields no advantage, while loading just one
saves ~11 cycles.

decicycles count:
put8:  3223 (mmx)    -> 2387
avg8:  2863 (mmxext) -> 2125
put16: 4356 (sse2)   -> 3553
avg16: 4481 (sse2)   -> 3513

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
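For context, here is a minimal C sketch of the scalar computation these routines vectorize (hypothetical helper names, not the actual hpeldsp C prototypes): each _xy2 output pixel is the rounded average of a 2x2 block of source pixels. The SSSE3 code obtains the same value from pmaddubsw pair sums against pb_1 followed by pmulhrsw with pw_8192, since (sum * 8192 * 2 + 0x8000) >> 16 == (sum + 2) >> 2.

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical scalar reference for the rounded _xy2 half-pel case. */
    static void put_pixels_xy2_ref(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = src[x] + src[x + 1] +
                          src[x + stride] + src[x + stride + 1];
                dst[x] = (sum + 2) >> 2;   /* what pmulhrsw by 8192 computes */
            }
            src += stride;
            dst += stride;
        }
    }

    /* avg variant: pavgb-style rounded average with the existing destination. */
    static void avg_pixels_xy2_ref(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = src[x] + src[x + 1] +
                          src[x + stride] + src[x + stride + 1];
                dst[x] = (dst[x] + ((sum + 2) >> 2) + 1) >> 1;
            }
            src += stride;
            dst += stride;
        }
    }

The avg variant combines the interpolated value with the existing destination exactly as pavgb does, i.e. (a + b + 1) >> 1.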
Diffstat (limited to 'libavcodec/x86/hpeldsp.asm')
-rw-r--r--  libavcodec/x86/hpeldsp.asm  70
1 file changed, 70 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4af423aee5..76e4632cbc 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -30,6 +30,9 @@
SECTION_RODATA
cextern pb_1
cextern pw_2
+pw_8192: times 8 dw (1<<13)
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
SECTION_TEXT
@@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2
+%if %0 == 2 ; xmm (16-pixel) variant; the extra argument is the xmm register count
+cglobal %1_pixels16_xy2, 4,5,%2
+ mova m4, [pb_interleave16]
+%else
+cglobal %1_pixels8_xy2, 4,5
+ mova m4, [pb_interleave8]
+%endif
+ mova m5, [pb_1]      ; all-ones bytes: pmaddubsw against this sums adjacent byte pairs
+ movu m0, [r1]        ; first source row
+ movu m1, [r1+1]      ; same row shifted right by one pixel
+ pmaddubsw m0, m5     ; words of m0 = src[2j]   + src[2j+1] (even output columns)
+ pmaddubsw m1, m5     ; words of m1 = src[2j+1] + src[2j+2] (odd output columns)
+ xor r4, r4
+ add r1, r2
+.loop:
+ movu m2, [r1+r4]     ; next source row
+ movu m3, [r1+r4+1]
+ pmaddubsw m2, m5     ; pair sums of the next row
+ pmaddubsw m3, m5
+ paddusw m0, m2       ; each word now holds the sum of 4 neighbouring pixels
+ paddusw m1, m3
+ pmulhrsw m0, [pw_8192] ; pmulhrsw by 8192 == (x + 2) >> 2, a rounded divide by 4
+ pmulhrsw m1, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]     ; existing destination pixels for averaging
+ packuswb m0, m1      ; even columns in the low half, odd columns in the high half
+ pshufb m0, m4        ; re-interleave into natural pixel order
+ pavgb m0, m6         ; rounded average with the destination
+%else
+ packuswb m0, m1
+ pshufb m0, m4
+%endif
+ mova [r0+r4], m0
+ add r4, r2
+
+ movu m0, [r1+r4]     ; second unrolled iteration: m2/m3 still hold the previous row's pair sums
+ movu m1, [r1+r4+1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ paddusw m2, m0
+ paddusw m3, m1
+ pmulhrsw m2, [pw_8192]
+ pmulhrsw m3, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4]
+ packuswb m2, m3
+ pshufb m2, m4
+ pavgb m2, m6
+%else
+ packuswb m2, m3
+ pshufb m2, m4
+%endif
+ mova [r0+r4], m2
+ add r4, r2
+ sub r3d, 2
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3
+SSSE3_PIXELS_XY2 put, 6
+SSSE3_PIXELS_XY2 avg, 7
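
A note on the pb_interleave constants: pmaddubsw leaves the even output columns in one register and the odd ones in another, so after packuswb the bytes sit as e0..e7 followed by o0..o7, and the pshufb mask 0,8,1,9,... restores the natural pixel order. A scalar sketch of the same permutation (illustration only, hypothetical helper name):

    #include <stdint.h>

    /* Scalar equivalent of pshufb with pb_interleave16: take the packed
     * even/odd halves and weave them back into e0 o0 e1 o1 ... */
    static void interleave16_ref(uint8_t dst[16], const uint8_t packed[16])
    {
        static const uint8_t mask[16] = {
            0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
        };
        for (int i = 0; i < 16; i++)
            dst[i] = packed[mask[i]];
    }

The 8-byte pb_interleave8 mask does the same job for the MMX (8-pixel) version.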