author     Christophe GISQUET <christophe.gisquet@gmail.com>   2012-03-19 22:46:28 +0100
committer  Ronald S. Bultje <rsbultje@gmail.com>                2012-04-10 10:06:48 -0700
commit     272b252c0110225188c7d7f31167941210aac197 (patch)
tree       47bea5996c88057a418e8872a655bac8f261736e /libavcodec/x86/rv40dsp.asm
parent     d3c59d5003a483f1a23e225fc71c19bd1116d11c (diff)
rv40dsp: implement prescaled versions for biweight.
Quite often, the original weights are multiples of 512. By prescaling them by 1/512 when they are computed (once per frame), no intermediate shifting is needed, and no prescaling on each call either. The x86 code already used that trick.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
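As an illustration of the trick, here is a minimal C model of the two weighting paths. It is a sketch under assumptions, not the decoder's actual code: the helper names are invented, and the 9 + 5 shift split with the +16 rounding term is inferred from the FP0.14 comment and the pw_16 constant in the asm below.

    #include <stdint.h>

    /* General case ("rnd" variant): arbitrary FP0.14 weights, so each
     * product is shifted by 9, rounded with +16 and shifted by 5 more. */
    static uint8_t weight_rnd(uint8_t s1, uint8_t s2, int w1, int w2)
    {
        return (((w1 * s1) >> 9) + ((w2 * s2) >> 9) + 0x10) >> 5;
    }

    /* Prescaled case ("nornd" variant): both weights were multiples of
     * 512 and have already been divided by 512 once per frame, so the
     * intermediate >> 9 disappears from the per-pixel computation.     */
    static uint8_t weight_nornd(uint8_t s1, uint8_t s2, int w1, int w2)
    {
        return (w1 * s1 + w2 * s2 + 0x10) >> 5;
    }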
Diffstat (limited to 'libavcodec/x86/rv40dsp.asm')
-rw-r--r--   libavcodec/x86/rv40dsp.asm   70
1 file changed, 31 insertions, 39 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index bff3e7b96a..9028e74024 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -139,69 +139,61 @@ SECTION .text
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=size %2=num of xmm regs
-%macro RV40_WEIGHT 2
-cglobal rv40_weight_func_%1, 6, 7, %2
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (i.e. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Therefore, we check here whether they are multiples of 2^9 for
+; those simplifications to occur.
+%macro RV40_WEIGHT 3
+cglobal rv40_weight_func_%1_%2, 6, 7, %3
%if cpuflag(ssse3)
mova m1, [shift_round]
%else
mova m1, [pw_16]
%endif
pxor m0, m0
- mov r6, r3
- or r6, r4
- ; The weights are FP0.14 notation of fractions depending on pts.
- ; For timebases without rounding error (i.e. PAL), the fractions
- ; can be simplified, and several operations can be avoided.
- ; Therefore, we check here whether they are multiples of 2^9 for
- ; those simplifications to occur.
- and r6, 0x1FF
; Set loop counter and increments
%if mmsize == 8
- mov r6, %1
+ mov r6, %2
%else
- mov r6, (%1 * %1) / mmsize
+ mov r6, (%2 * %2) / mmsize
%endif
- ; Use result of test now
- jz .loop_512
movd m2, r3
movd m3, r4
+%ifidn %1,rnd
+%define RND 0
SPLATW m2, m2
- SPLATW m3, m3
-
-.loop:
- MAIN_LOOP %1, 0
- jnz .loop
- REP_RET
-
- ; Weights are multiple of 512, which allows some shortcuts
-.loop_512:
- sar r3, 9
- sar r4, 9
- movd m2, r3
- movd m3, r4
+%else
+%define RND 1
%if cpuflag(ssse3)
punpcklbw m3, m2
- SPLATW m3, m3
%else
SPLATW m2, m2
- SPLATW m3, m3
%endif
-.loop2:
- MAIN_LOOP %1, 1
- jnz .loop2
- REP_RET
+%endif
+ SPLATW m3, m3
+.loop:
+ MAIN_LOOP %2, RND
+ jnz .loop
+ REP_RET
%endmacro
INIT_MMX mmx
-RV40_WEIGHT 8, 0
-RV40_WEIGHT 16, 0
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM sse2
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM ssse3
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
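For context, a hedged sketch of how a caller could use the new _rnd/_nornd entry points: the multiple-of-512 test that the deleted asm performed on every call ((r3 | r4) & 0x1FF) is instead done once per frame, where the weights are computed. The types and function names below are hypothetical, not FFmpeg's API.

    #include <stdint.h>

    /* Hypothetical dispatch sketch; the real decoder keeps such function
     * pointers in its DSP context. */
    typedef void (*rv40_weight_func)(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                     int w1, int w2, int stride);

    typedef struct {
        rv40_weight_func rnd;    /* full FP0.14 weights, per-pixel >> 9   */
        rv40_weight_func nornd;  /* prescaled weights, no per-pixel shift */
    } RV40WeightFuncs;

    /* Run once per frame: prescale the weights if possible and pick the
     * cheaper variant. */
    static rv40_weight_func pick_weight_func(const RV40WeightFuncs *f,
                                             int *w1, int *w2)
    {
        if (((*w1 | *w2) & 0x1FF) == 0) { /* both are multiples of 2^9 */
            *w1 >>= 9;                    /* prescale by 1/512 once... */
            *w2 >>= 9;                    /* ...instead of per call    */
            return f->nornd;
        }
        return f->rnd;                    /* general, rounded variant  */
    }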