summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Christophe Gisquet <christophe.gisquet@gmail.com> 2015-10-12 19:37:46 +0200
committer: Michael Niedermayer <michael@niedermayer.cc> 2015-10-13 12:51:10 +0200
commit: e652f69b354bc6b5819012979985794cfd2805c9 (patch)
tree: e4a5f232f369360fa587de0d9ccc64692004a1fb
parent: 3b336ec2fbd4b9e16144d3247428009c6fb301f0 (diff)
x86: simple_idct10_template: fix overflow in pass
When the input of a pass has 15 or 16 bits of precision (in particular the column pass), adding the bias to the W4 term may overflow the input to pmaddwd. The addition of the bias therefore has to be postponed until after the first butterfly. To do so, this patch exploits the fact that m15 was zeroed but otherwise unused. When the pass is safe, the bias can be used directly from its address, and the number of xmm registers can be decreased. Otherwise, the 32-bit bias is loaded into m15. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
-rw-r--r-- libavcodec/x86/proresdsp.asm 8
-rw-r--r-- libavcodec/x86/simple_idct10_template.asm 13
2 files changed, 16 insertions, 5 deletions
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 18cf15b3ca..3fb71badba 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -37,17 +37,17 @@ cextern pw_1019
section .text align=16
-%macro idct_put_fn 1
-cglobal prores_idct_put_10, 4, 4, %1
+%macro idct_put_fn 0
+cglobal prores_idct_put_10, 4, 4, 15
IDCT_PUT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3
RET
%endmacro
INIT_XMM sse2
-idct_put_fn 16
+idct_put_fn
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
-idct_put_fn 16
+idct_put_fn
%endif
%endif
diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm
index 968d280ba3..e46c83f50c 100644
--- a/libavcodec/x86/simple_idct10_template.asm
+++ b/libavcodec/x86/simple_idct10_template.asm
@@ -75,6 +75,7 @@ cextern w7_min_w5
; a2 -= W6 * row[2];
; a3 -= W2 * row[2];
%ifstr %1
+ mova m15, [pd_round_ %+ %2]
%else
paddw m10, [%1]
%endif
@@ -87,6 +88,17 @@ cextern w7_min_w5
pmaddwd m7, m1, [w4_min_w2]
pmaddwd m0, [w4_plus_w2]
pmaddwd m1, [w4_plus_w2]
+%ifstr %1
+ ; Adding 1<<(%2-1) for >=15 bits values
+ paddd m2, m15
+ paddd m3, m15
+ paddd m4, m15
+ paddd m5, m15
+ paddd m6, m15
+ paddd m7, m15
+ paddd m0, m15
+ paddd m1, m15
+%endif
; a0: -1*row[0]-1*row[2]
; a1: -1*row[0]
@@ -225,7 +237,6 @@ cextern w7_min_w5
%macro IDCT_PUT_FN 6-7
movsxd r1, r1d
- pxor m15, m15 ; zero
; for (i = 0; i < 8; i++)
; idctRowCondDC(block + i*8);