summary | refs | log | tree | commit | diff
path: root/libavcodec/x86/vp3dsp.asm
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2012-07-22 20:38:56 +0000
committer: Derek Buitenhuis <derek.buitenhuis@gmail.com> 2012-07-25 14:24:30 -0400
commit: a1878a88a1dc3e3b0abaee910a18f0a6a30b0805 (patch)
tree: 6fc53e4db758f6955638c5183442b400ff50b875 /libavcodec/x86/vp3dsp.asm
parent: 79195ce56500a137c7d3152d83dc27d848086405 (diff)
vp3: don't use calls to inline asm in yasm code.
Mixing yasm and inline asm is a bad idea, since if either yasm or inline asm is not supported by your toolchain, all of the asm stops working. Thus, better to use either one or the other alone.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Diffstat (limited to 'libavcodec/x86/vp3dsp.asm')
-rw-r--r-- libavcodec/x86/vp3dsp.asm | 120
1 files changed, 79 insertions, 41 deletions
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 58fa1f7b27..af2f60c6ae 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
+cextern pb_80
cextern pb_81
cextern pw_8
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
%endif
-%if WIN64
- call put_signed_pixels_clamped_mmx
- RET
-%else
- jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
%endif
+%assign %%i %%i+64
+%endrep
+ RET
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
-%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+
+ mov r3, 4
+ pxor m4, m4
+ movsxdifnidn r1, r1d
+.loop:
+ movq m0, [r0]
+ movq m1, [r0+r1]
+%if mmsize == 8
+ mova m2, m0
+ mova m3, m1
%endif
-%if WIN64
- call add_pixels_clamped_mmx
- RET
-%else
- jmp add_pixels_clamped_mmx
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+%if mmsize == 8
+ punpckhbw m2, m4
+ punpckhbw m3, m4
+%endif
+ paddsw m0, [r2+ 0]
+ paddsw m1, [r2+16]
+%if mmsize == 8
+ paddsw m2, [r2+ 8]
+ paddsw m3, [r2+24]
+ packuswb m0, m2
+ packuswb m1, m3
+%else ; mmsize == 16
+ packuswb m0, m1
%endif
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1], m1
+%else ; mmsize == 16
+ movhps [r0+r1], m0
+%endif
+ lea r0, [r0+r1*2]
+ add r2, 32
+ dec r3
+ jg .loop
+ RET
%endmacro
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]