summary | refs | log | tree | commit | diff
path: root/libavcodec/x86/vp3dsp.asm
diff options
context:
space:
mode:
author: Michael Niedermayer <michaelni@gmx.at> 2012-07-26 21:36:03 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2012-07-26 21:37:15 +0200
commit: 7333798c85837f1cf175f39bc4acb5664fa6cacc (patch)
tree: 60036638a0962b3cb966d62da2eda81f93ac3267 /libavcodec/x86/vp3dsp.asm
parent: 307a20cca216356aec30f5bb102c633169cbc0c1 (diff)
parent: 44dc9c6af0377faf2a99889d1f949e32a1102e84 (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master:
  libopenjpeg: support YUV and deep RGB pixel formats
  Fix typo in v410 decoder.
  vf_yadif: unset cur_buf on the input link.
  vf_overlay: ensure the overlay frame does not get leaked.
  vf_overlay: prevent premature freeing of cur_buf
  Support urlencoded http authentication credentials
  rtmp: Return an error when the client bandwidth is incorrect
  rtmp: Return proper error code in handle_server_bw
  rtmp: Return proper error code in handle_client_bw
  rtmp: Return proper error codes in handle_chunk_size
  lavr: x86: add missing vzeroupper in ff_mix_1_to_2_fltp_flt()
  vp8: Replace x*155/100 by x*101581>>16.
  vp3: don't use calls to inline asm in yasm code.
  x86/dsputil: put inline asm under HAVE_INLINE_ASM.
  dsputil_mmx: fix incorrect assembly code
  rtmp: Factorize the code by adding handle_invoke
  rtmp: Factorize the code by adding handle_chunk_size
  rtmp: Factorize the code by adding handle_ping
  rtmp: Factorize the code by adding handle_client_bw
  rtmp: Factorize the code by adding handle_server_bw

Conflicts:
  libavcodec/libopenjpegdec.c
  libavcodec/x86/dsputil_mmx.c
  libavfilter/vf_overlay.c
  libavformat/Makefile
  libavformat/version.h

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/vp3dsp.asm')
-rw-r--r-- libavcodec/x86/vp3dsp.asm 120
1 files changed, 79 insertions, 41 deletions
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 0e0bd29a99..46bd9d8f86 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
+cextern pb_80
cextern pb_81
cextern pw_8
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
%endif
-%if WIN64
- call put_signed_pixels_clamped_mmx
- RET
-%else
- jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
%endif
+%assign %%i %%i+64
+%endrep
+ RET
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
-%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+
+ mov r3, 4
+ pxor m4, m4
+ movsxdifnidn r1, r1d
+.loop:
+ movq m0, [r0]
+ movq m1, [r0+r1]
+%if mmsize == 8
+ mova m2, m0
+ mova m3, m1
%endif
-%if WIN64
- call add_pixels_clamped_mmx
- RET
-%else
- jmp add_pixels_clamped_mmx
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+%if mmsize == 8
+ punpckhbw m2, m4
+ punpckhbw m3, m4
+%endif
+ paddsw m0, [r2+ 0]
+ paddsw m1, [r2+16]
+%if mmsize == 8
+ paddsw m2, [r2+ 8]
+ paddsw m3, [r2+24]
+ packuswb m0, m2
+ packuswb m1, m3
+%else ; mmsize == 16
+ packuswb m0, m1
%endif
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1], m1
+%else ; mmsize == 16
+ movhps [r0+r1], m0
+%endif
+ lea r0, [r0+r1*2]
+ add r2, 32
+ dec r3
+ jg .loop
+ RET
%endmacro
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]