From a7878c9f73c12cfa685bd8af8f3afcca85f56a8b Mon Sep 17 00:00:00 2001
From: Mans Rullgard
Date: Tue, 1 Feb 2011 22:38:15 +0000
Subject: VP8: ARM optimised decode_block_coeffs_internal

Approximately 5% faster on Cortex-A8.

Signed-off-by: Mans Rullgard
---
 libavcodec/arm/Makefile    |   2 +
 libavcodec/arm/vp8.h       |  29 ++++++
 libavcodec/arm/vp8_armv6.S | 220 +++++++++++++++++++++++++++++++++++++++++++++
 libavcodec/vp8.c           |   8 +-
 libavcodec/vp8data.h       |   2 +-
 5 files changed, 259 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/arm/vp8.h
 create mode 100644 libavcodec/arm/vp8_armv6.S

(limited to 'libavcodec')

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 15269ea676..d223703cfe 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -3,6 +3,7 @@ OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
 OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o
 
 OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
 OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
@@ -23,6 +24,7 @@ OBJS-$(HAVE_ARMV5TE) += arm/dsputil_init_armv5te.o \
 
 OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
                       arm/dsputil_armv6.o \
                       arm/simple_idct_armv6.o \
+                      $(ARMV6-OBJS-yes)
 
 VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
diff --git a/libavcodec/arm/vp8.h b/libavcodec/arm/vp8.h
new file mode 100644
index 0000000000..90e7344b62
--- /dev/null
+++ b/libavcodec/arm/vp8.h
@@ -0,0 +1,29 @@
+/**
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP8_H
+#define AVCODEC_ARM_VP8_H
+
+#if HAVE_ARMV6
+#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6 /* vp8.c wraps its own C definition in #ifndef decode_block_coeffs_internal, so defining the name here selects the ARMv6 routine instead */
+int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, DCTELEM block[16],
+                                 uint8_t probs[8][3][NUM_DCT_TOKENS-1],
+                                 int i, uint8_t *token_prob, int16_t qmul[2]); /* implemented in arm/vp8_armv6.S; the types used here must already be declared by the including file */
+#endif /* HAVE_ARMV6 */
+
+#endif /* AVCODEC_ARM_VP8_H */
diff --git a/libavcodec/arm/vp8_armv6.S b/libavcodec/arm/vp8_armv6.S
new file mode 100644
index 0000000000..54c036b82a
--- /dev/null
+++ b/libavcodec/arm/vp8_armv6.S
@@ -0,0 +1,220 @@
+/**
+ * Copyright (C) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        .syntax unified
+
+.macro rac_get_prob h, bs, buf, cw, pr, t0, t1 /* decode one range-coder decision with probability pr; t0 holds the shift on entry, t1 is scratch; exits with the cmp flags set (callers treat GE as 1, LT as 0) */
+        adds            \bs, \bs, \t0           /* bs += shift; the carry out gates the conditional refill below */
+        lsl             \cw, \cw, \t0           /* cw <<= shift */
+        lsl             \t0, \h, \t0            /* t0 = h << shift */
+        rsb             \h, \pr, #256           /* h = 256 - pr */
+        ldrhcs          \t1, [\buf], #2         /* on carry: load the next halfword, buf += 2 */
+        smlabb          \h, \t0, \pr, \h        /* h = lo16(t0) * lo16(pr) + (256 - pr) */
+        rev16cs         \t1, \t1                /* on carry: swap the two bytes of the loaded halfword */
+        orrcs           \cw, \cw, \t1, lsl \bs  /* on carry: merge the new bits into cw at bit position bs */
+        subcs           \bs, \bs, #16           /* on carry: bs -= 16 */
+        lsr             \h, \h, #8              /* h = split value */
+        cmp             \cw, \h, lsl #16        /* compare cw with split << 16 */
+        subge           \cw, \cw, \h, lsl #16   /* GE: cw -= split << 16 */
+        subge           \h, \t0, \h             /* GE: h = t0 - split; LT: h stays at split */
+.endm
+
+.macro rac_get_128 h, bs, buf, cw, t0, t1 /* same sequence as rac_get_prob with the probability fixed at 128 */
+        adds            \bs, \bs, \t0           /* bs += shift; carry gates the refill */
+        lsl             \cw, \cw, \t0
+        lsl             \t0, \h, \t0            /* t0 = h << shift */
+        ldrhcs          \t1, [\buf], #2
+        mov             \h, #128
+        rev16cs         \t1, \t1
+        add             \h, \h, \t0, lsl #7     /* h = 128 + t0 * 128 */
+        orrcs           \cw, \cw, \t1, lsl \bs
+        subcs           \bs, \bs, #16
+        lsr             \h, \h, #8              /* h = split value */
+        cmp             \cw, \h, lsl #16
+        subge           \cw, \cw, \h, lsl #16
+        subge           \h, \t0, \h
+.endm
+
+function ff_decode_block_coeffs_armv6, export=1 /* per the prototype in arm/vp8.h: r0 = rc, r1 = block, r2 = probs, r3 = i, with token_prob and qmul on the stack; returns the final r3 in r0 */
+        push            {r0,r1,r4-r11,lr}       /* 11 words = 44 bytes; the saved r0 and r1 sit at [sp] and [sp, #4] */
+        movrel          lr, ff_vp56_norm_shift  /* lr = byte table indexed by r5 (high) to obtain the shift */
+        ldrd            r4, r5, [sp, #44]       @ token_prob, qmul
+        cmp             r3, #0                  /* flags consumed by pkhtbne below */
+        ldr             r11, [r5]               /* both 16-bit qmul entries loaded as one word */
+        ldm             r0, {r5-r7}             @ high, bits, buf
+        pkhtbne         r11, r11, r11, asr #16  /* i != 0: copy the upper halfword of r11 into the lower one */
+        ldr             r8, [r0, #16]           @ code_word
+0:      /* token loop entry that skips the token_prob[0] test */
+        ldrb            r9, [lr, r5]            /* r9 = shift for the current high */
+        add             r3, r3, #1              /* r3 now holds the index of this position plus one */
+        ldrb            r0, [r4, #1]            /* token_prob[1] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        blt             2f                      /* LT: nothing is stored for this position */
+
+        ldrb            r9, [lr, r5]
+        ldrb            r0, [r4, #2]            /* token_prob[2] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        ldrb            r9, [lr, r5]            /* ldrb leaves the flags from rac_get_prob intact */
+        bge             3f                      /* GE: magnitude of 2 or more, decoded from 3: onward */
+
+        add             r4, r3, r3, lsl #5      /* r4 = r3 * 33 */
+        sxth            r12, r11                /* magnitude 1: value = sign-extended low halfword of r11 */
+        add             r4, r2, r4
+        adds            r6, r6, r9              /* from here: rac_get_128 written out inline and interleaved with the address setup */
+        add             r4, r4, #11             /* r4 = probs + r3 * 33 + 11 */
+        lsl             r8, r8, r9
+        ldrhcs          r10, [r7], #2
+        lsl             r9, r5, r9
+        mov             r5, #128
+        rev16cs         r10, r10
+        add             r5, r5, r9, lsl #7
+        orrcs           r8, r8, r10, lsl r6
+        subcs           r6, r6, #16
+        lsr             r5, r5, #8
+        cmp             r8, r5, lsl #16
+        movrel          r10, zigzag_scan-1      /* table base minus one because r3 was already incremented */
+        subge           r8, r8, r5, lsl #16
+        subge           r5, r9, r5
+        ldrb            r10, [r10, r3]          /* byte offset of the destination inside block */
+        rsbge           r12, r12, #0            /* GE: negate the value */
+        cmp             r3, #16
+        strh            r12, [r1, r10]          /* store the 16-bit coefficient */
+        bge             6f                      /* r3 >= 16: done */
+5:      /* reached after a coefficient has been stored */
+        ldrb            r9, [lr, r5]
+        ldrb            r0, [r4]                /* token_prob[0] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        pkhtb           r11, r11, r11, asr #16  /* both halves of r11 now hold its upper halfword; flags untouched */
+        bge             0b                      /* GE: decode the next token; LT falls through to the exit */
+
+6:      /* exit: write the coder state back and return */
+        ldr             r0, [sp]                /* reload the rc pointer saved by push */
+        ldr             r9, [r0, #12]           /* word at rc + 12 -- presumably the end-of-buffer pointer; confirm against VP56RangeCoder */
+        cmp             r7, r9
+        movhi           r7, r9                  /* clamp buf to that value */
+        stm             r0, {r5-r7}             @ high, bits, buf
+        str             r8, [r0, #16]           @ code_word
+
+        add             sp, sp, #8              /* discard the saved r0 and r1 */
+        mov             r0, r3                  /* return value */
+        pop             {r4-r11,pc}
+2:      /* token_prob[1] decision was LT */
+        add             r4, r3, r3, lsl #5      /* r4 = r3 * 33 */
+        cmp             r3, #16
+        add             r4, r2, r4              /* r4 = probs + r3 * 33 */
+        pkhtb           r11, r11, r11, asr #16  /* as at 5: both halves of r11 = upper halfword */
+        bne             0b                      /* r3 != 16: continue with the next position */
+        b               6b
+3:      /* magnitude >= 2 */
+        ldrb            r0, [r4, #3]            /* token_prob[3] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        ldrb            r9, [lr, r5]
+        bge             1f
+
+        mov             r12, #2                 /* magnitude 2, 3 or 4 */
+        ldrb            r0, [r4, #4]            /* token_prob[4] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r12, #1
+        ldrb            r9, [lr, r5]
+        blt             4f
+        ldrb            r0, [r4, #5]            /* token_prob[5] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r12, #1
+        ldrb            r9, [lr, r5]
+        b               4f
+1:
+        ldrb            r0, [r4, #6]            /* token_prob[6] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        ldrb            r9, [lr, r5]
+        bge             3f
+
+        ldrb            r0, [r4, #7]            /* token_prob[7] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        ldrb            r9, [lr, r5]
+        bge             2f
+
+        mov             r12, #5                 /* magnitude 5 or 6 */
+        mov             r0, #159                /* fixed probability for the extra bit */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r12, r12, #1
+        ldrb            r9, [lr, r5]
+        b               4f
+2:
+        mov             r12, #7                 /* magnitude 7..10: two extra bits with fixed probabilities */
+        mov             r0, #165
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r12, r12, #2
+        ldrb            r9, [lr, r5]
+        mov             r0, #145
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r12, r12, #1
+        ldrb            r9, [lr, r5]
+        b               4f
+3:      /* larger magnitudes: select one of four entries of ff_vp8_dct_cat_prob */
+        ldrb            r0, [r4, #8]            /* token_prob[8] */
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        addge           r4, r4, #1              /* GE: the load below then reads token_prob[10] instead of [9] */
+        ldrb            r9, [lr, r5]
+        movge           r12, #2
+        movlt           r12, #0
+        ldrb            r0, [r4, #9]
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        mov             r9, #8
+        addge           r12, r12, #1            /* r12 = table index 0..3 */
+        movrel          r4, ff_vp8_dct_cat_prob
+        lsl             r9, r9, r12             /* r9 = 8 << index */
+        ldr             r4, [r4, r12, lsl #2]   /* r4 = word-sized table entry, used as a pointer to a probability list */
+        add             r12, r9, #3             /* base magnitude = 3 + (8 << index) */
+        mov             r1, #0                  /* r1 reused as accumulator; block pointer is reloaded after the loop */
+        ldrb            r0, [r4], #1            /* first probability of the list */
+1:      /* one extra bit per list entry, most significant first */
+        ldrb            r9, [lr, r5]
+        lsl             r1, r1, #1
+        rac_get_prob    r5, r6, r7, r8, r0, r9, r10
+        ldrb            r0, [r4], #1            /* next probability; flags unchanged */
+        addge           r1, r1, #1
+        cmp             r0, #0                  /* a zero byte ends the list */
+        bne             1b
+        ldrb            r9, [lr, r5]
+        add             r12, r12, r1            /* add the extra bits to the base magnitude */
+        ldr             r1, [sp, #4]            /* restore the block pointer saved by push */
+4:      /* r12 = magnitude: decode the sign, scale, store */
+        add             r4, r3, r3, lsl #5      /* r4 = r3 * 33 */
+        add             r4, r2, r4
+        add             r4, r4, #22             /* r4 = probs + r3 * 33 + 22 */
+        rac_get_128     r5, r6, r7, r8, r9, r10
+        rsbge           r12, r12, #0            /* GE: negate */
+        smulbb          r12, r12, r11           /* multiply by the low halfword of r11 */
+        movrel          r9, zigzag_scan-1       /* minus one because r3 was already incremented */
+        ldrb            r9, [r9, r3]
+        cmp             r3, #16
+        strh            r12, [r1, r9]
+        bge             6b
+        b               5b
+endfunc
+
+        .section        .rodata
+zigzag_scan: /* byte offsets into block, used by the strh stores above */
+        .byte           0, 2, 8, 16
+        .byte           10, 4, 6, 12
+        .byte           18, 24, 26, 20
+        .byte           14, 22, 28, 30
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 8de8968d6e..3cd76249d0 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -30,6 +30,10 @@
 #include "h264pred.h"
 #include "rectangle.h"
 
+#if ARCH_ARM
+# include "arm/vp8.h"
+#endif
+
 typedef struct {
     uint8_t filter_level;
     uint8_t inner_limit;
@@ -801,6 +805,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
     }
 }
 
+#ifndef decode_block_coeffs_internal
 /**
  * @param c arithmetic bitstream reader context
  * @param block destination for block coefficients
@@ -854,7 +859,7 @@ skip_eob:
                     int b = vp56_rac_get_prob(c, token_prob[9+a]);
                     int cat = (a<<1) + b;
                     coeff = 3 + (8<