summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
author Ronald S. Bultje <rsbultje@gmail.com> 2013-03-12 07:28:12 -0700
committer Martin Storsjö <martin@martin.st> 2013-04-15 12:32:05 +0300
commit 015821229f96bf7e677f2a711a58dbea3009f574 (patch)
tree 2247f2d16c077a1f887656b8859b164eca6b84df /libavcodec/x86
parent 5941978e71d2c3a8e2a7e87951e081e0b2e77da9 (diff)
vp3: Use full transpose for all IDCTs
This way, the special IDCT permutations are no longer needed. This is similar to how H264 does it, and removes the dsputil dependency imposed by the scantable code.

Also remove the unused type == 0 cases from the plain C version of the idct.

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/vp3dsp.asm 123
-rw-r--r-- libavcodec/x86/vp3dsp_init.c 2
2 files changed, 82 insertions, 43 deletions
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 078e9db99a..fc8a047224 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -501,22 +501,22 @@ cglobal vp3_h_loop_filter, 3, 4
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
-%define I(x) [%1+16* x ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
RowIDCT
Transpose
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
RowIDCT
Transpose
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x +64]
+%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
@@ -534,10 +534,17 @@ cglobal vp3_idct_put, 3, 4, 9
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+ packsswb m0, [r2+mmsize*8+%%i]
+ packsswb m1, [r2+mmsize*10+%%i]
+ packsswb m2, [r2+mmsize*12+%%i]
+ packsswb m3, [r2+mmsize*14+%%i]
+%else
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
+%endif
paddb m0, m4
paddb m1, m4
paddb m2, m4
@@ -561,7 +568,7 @@ cglobal vp3_idct_put, 3, 4, 9
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
-%assign %%i %%i+64
+%assign %%i %%i+8
%endrep
pxor m0, m0
@@ -575,47 +582,81 @@ cglobal vp3_idct_put, 3, 4, 9
cglobal vp3_idct_add, 3, 4, 9
VP3_IDCT r2
- mov r3, 4
- pxor m4, m4
movsxdifnidn r1, r1d
-.loop:
+ lea r3, [r1*3]
+ pxor m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
movq m0, [r0]
movq m1, [r0+r1]
-%if mmsize == 8
- mova m2, m0
- mova m3, m1
-%endif
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
punpcklbw m0, m4
punpcklbw m1, m4
-%if mmsize == 8
- punpckhbw m2, m4
- punpckhbw m3, m4
-%endif
- paddsw m0, [r2+ 0]
- paddsw m1, [r2+16]
-%if mmsize == 8
- paddsw m2, [r2+ 8]
- paddsw m3, [r2+24]
- packuswb m0, m2
- packuswb m1, m3
-%else ; mmsize == 16
+ punpcklbw m2, m4
+ punpcklbw m3, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m3, [r2+48+%%i]
packuswb m0, m1
+ packuswb m2, m3
+ movq [r0 ], m0
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m2
+ movhps [r0+r3 ], m2
+%if %%i == 0
+ lea r0, [r0+r1*4]
%endif
- movq [r0 ], m0
-%if mmsize == 8
- movq [r0+r1], m1
-%else ; mmsize == 16
- movhps [r0+r1], m0
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+ movq m0, [r0]
+ movq m1, [r0+r1]
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
+ movq m5, m0
+ movq m6, m1
+ movq m7, m2
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m5, m4
+ punpckhbw m6, m4
+ punpckhbw m7, m4
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m5, [r2+64+%%i]
+ paddsw m6, [r2+80+%%i]
+ paddsw m7, [r2+96+%%i]
+ packuswb m0, m5
+ movq m5, m3
+ punpcklbw m3, m4
+ punpckhbw m5, m4
+ packuswb m1, m6
+ paddsw m3, [r2+48+%%i]
+ paddsw m5, [r2+112+%%i]
+ packuswb m2, m7
+ packuswb m3, m5
+ movq [r0 ], m0
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
+%if %%i == 0
+ lea r0, [r0+r1*4]
%endif
- lea r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
- mova [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+ mova [r2+%%i], m4
+%assign %%i %%i+mmsize
%endrep
- add r2, 32
- dec r3
- jg .loop
RET
%endmacro
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 2668bcf5b9..cc52fbccdd 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -48,7 +48,6 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
if (EXTERNAL_MMX(cpuflags)) {
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
- c->idct_perm = FF_PARTTRANS_IDCT_PERM;
}
#endif
@@ -64,6 +63,5 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
if (EXTERNAL_SSE2(cpuflags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
- c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
}