summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- libavcodec/x86/vp9itxfm_16bpp.asm | 233
-rw-r--r-- tests/checkasm/vp9dsp.c           |   2
2 files changed, 201 insertions(+), 34 deletions(-)
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index 291b4a1880..3257986d30 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -97,6 +97,40 @@ pw_m3196_m16069: times 4 dw -3196, -16069
pw_m13623_m9102: times 4 dw -13623, -9102
pw_m6270_m15137: times 4 dw -6270, -15137
+default_8x8:
+times 12 db 1
+times 52 db 2
+row_8x8:
+times 18 db 1
+times 46 db 2
+col_8x8:
+times 6 db 1
+times 58 db 2
+default_16x16:
+times 10 db 1
+times 28 db 2
+times 51 db 3
+times 167 db 4
+row_16x16:
+times 21 db 1
+times 45 db 2
+times 60 db 3
+times 130 db 4
+col_16x16:
+times 5 db 1
+times 12 db 2
+times 25 db 3
+times 214 db 4
+default_32x32:
+times 9 db 1
+times 25 db 2
+times 36 db 3
+times 65 db 4
+times 105 db 5
+times 96 db 6
+times 112 db 7
+times 576 db 8
+
SECTION .text
%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
@@ -636,18 +670,21 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
jg .loop_dc
RET
- ; FIXME a sub-idct for the top-left 4x4 coefficients would save 1 loop
- ; iteration in the first idct (2->1) and thus probably a lot of time.
- ; I haven't implemented that yet, though
-
.idctfull:
mova [rsp+16*mmsize], m0
- DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
mov dstbakq, dstq
+ movsxd cntq, cntd
%endif
- lea stride3q, [strideq*3]
- mov cntd, 2
+%ifdef PIC
+ lea ptrq, [default_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
mov ptrq, rsp
.loop_1:
IDCT8_1D blockq
@@ -668,6 +705,24 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
dec cntd
jg .loop_1
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
mov cntd, 2
mov ptrq, rsp
.loop_2:
@@ -854,20 +909,27 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
SWAP 2, 7, 6
%endmacro
-%macro IADST8_FN 4
-cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
- 17 * mmsize + ARCH_X86_32 * 5 * mmsize, \
+%macro IADST8_FN 5
+cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
dst, stride, block, eob
mova m0, [pw_1023]
.body:
mova [rsp+16*mmsize], m0
- DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
mov dstbakq, dstq
+ movsxd cntq, cntd
%endif
- lea stride3q, [strideq*3]
- mov cntd, 2
+%ifdef PIC
+ lea ptrq, [%5_8x8]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%5_8x8+cntq-1]
+%endif
+ mov skipd, 2
+ sub skipd, cntd
mov ptrq, rsp
.loop_1:
%2_1D blockq
@@ -888,6 +950,24 @@ cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
dec cntd
jg .loop_1
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ add ptrq, 4 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
mov cntd, 2
mov ptrq, rsp
.loop_2:
@@ -913,17 +993,17 @@ cglobal vp9_%1_%3_8x8_add_10, 3, 6 + ARCH_X86_64, 13, \
ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
RET
-cglobal vp9_%1_%3_8x8_add_12, 3, 6 + ARCH_X86_64, 13, \
- 17 * mmsize + ARCH_X86_32 * 5 * mmsize, \
+cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
+ 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
dst, stride, block, eob
mova m0, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
%endmacro
INIT_XMM sse2
-IADST8_FN idct, IDCT8, iadst, IADST8
-IADST8_FN iadst, IADST8, idct, IDCT8
-IADST8_FN iadst, IADST8, iadst, IADST8
+IADST8_FN idct, IDCT8, iadst, IADST8, row
+IADST8_FN iadst, IADST8, idct, IDCT8, col
+IADST8_FN iadst, IADST8, iadst, IADST8, default
%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
IDCT8_1D %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
@@ -1040,12 +1120,19 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
.idctfull:
mova [rsp+64*mmsize], m0
- DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
mov dstbakq, dstq
+ movsxd cntq, cntd
%endif
- lea stride3q, [strideq*3]
- mov cntd, 4
+%ifdef PIC
+ lea ptrq, [default_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
mov ptrq, rsp
.loop_1:
IDCT16_1D blockq
@@ -1084,6 +1171,28 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
dec cntd
jg .loop_1
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
mov cntd, 4
mov ptrq, rsp
.loop_2:
@@ -1318,20 +1427,27 @@ cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
SWAP 2, 5, 4, 6, 7, 3
%endmacro
-%macro IADST16_FN 6
-cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
+%macro IADST16_FN 7
+cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
dst, stride, block, eob
mova m0, [pw_1023]
.body:
mova [rsp+64*mmsize], m0
- DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
mov dstbakq, dstq
+ movsxd cntq, cntd
%endif
- lea stride3q, [strideq*3]
- mov cntd, 4
+%ifdef PIC
+ lea ptrq, [%7_16x16]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [%7_16x16+cntq-1]
+%endif
+ mov skipd, 4
+ sub skipd, cntd
mov ptrq, rsp
.loop_1:
%2_1D blockq
@@ -1370,6 +1486,28 @@ cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
dec cntd
jg .loop_1
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ add skipd, skipd
+ lea blockq, [blockq+skipq*(mmsize/2)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
mov cntd, 4
mov ptrq, rsp
.loop_2:
@@ -1419,7 +1557,7 @@ cglobal vp9_%1_%4_16x16_add_10, 3, 6 + ARCH_X86_64, 16, \
ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
RET
-cglobal vp9_%1_%4_16x16_add_12, 3, 6 + ARCH_X86_64, 16, \
+cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
dst, stride, block, eob
mova m0, [pw_4095]
@@ -1427,9 +1565,9 @@ cglobal vp9_%1_%4_16x16_add_12, 3, 6 + ARCH_X86_64, 16, \
%endmacro
INIT_XMM sse2
-IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70
-IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67
-IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70
+IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
+IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
+IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
%macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
IDCT16_1D %2, 2 * %3, 272, 257
@@ -1808,12 +1946,19 @@ cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
.idctfull:
mova [rsp+256*mmsize], m0
- DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
%if ARCH_X86_64
mov dstbakq, dstq
+ movsxd cntq, cntd
%endif
- lea stride3q, [strideq*3]
- mov cntd, 8
+%ifdef PIC
+ lea ptrq, [default_32x32]
+ movzx cntd, byte [ptrq+cntq-1]
+%else
+ movzx cntd, byte [default_32x32+cntq-1]
+%endif
+ mov skipd, 8
+ sub skipd, cntd
mov ptrq, rsp
.loop_1:
IDCT32_1D 1, blockq
@@ -1823,6 +1968,28 @@ cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
dec cntd
jg .loop_1
+ ; zero-pad the remainder (skipped cols)
+ test skipd, skipd
+ jz .end
+ shl skipd, 2
+ lea blockq, [blockq+skipq*(mmsize/4)]
+ pxor m0, m0
+.loop_z:
+ mova [ptrq+mmsize*0], m0
+ mova [ptrq+mmsize*1], m0
+ mova [ptrq+mmsize*2], m0
+ mova [ptrq+mmsize*3], m0
+ mova [ptrq+mmsize*4], m0
+ mova [ptrq+mmsize*5], m0
+ mova [ptrq+mmsize*6], m0
+ mova [ptrq+mmsize*7], m0
+ add ptrq, 8 * mmsize
+ dec skipd
+ jg .loop_z
+.end:
+
+ DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+ lea stride3q, [strideq*3]
mov cntd, 8
mov ptrq, rsp
.loop_2:
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 37a3ca6738..c1e13764e2 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -337,7 +337,7 @@ static void check_itxfm(void)
randomize_buffers();
ftx(coef, tx, txtp, sz, bit_depth);
- for (sub = (txtp == 0) ? 1 : sz; sub <= sz; sub <<= 1) {
+ for (sub = (txtp == 0) ? 1 : 2; sub <= sz; sub <<= 1) {
int eob;
if (sub < sz) {