diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-04-11 21:02:08 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-04-20 18:25:36 +0200 |
commit | 319235c67c59d6abaa78d5af57121ab9816f937d (patch) | |
tree | 4025d69a59c3e70b8cad858d486983b747671236 /libavcodec/x86/vc1dsp_mmx.c | |
parent | de9cd5884822375d492ff4dcc98e55317a66c196 (diff) |
vc1dsp: introduce cases for 8x8 and 16x16
This allows further unrolling the DSP implementation where possible.
The x86 and ARM DSP code is modified by simply moving the multiple calls from vc1dec
to the DSP code. Decoding improvements should only occur because the
compiler is actually able to unroll more.
Decoding time: ~8.80s -> 8.64s (i.e. around 2%)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/vc1dsp_mmx.c')
-rw-r--r-- | libavcodec/x86/vc1dsp_mmx.c | 106 |
1 files changed, 69 insertions, 37 deletions
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c index c06cb14496..d9108fc832 100644 --- a/libavcodec/x86/vc1dsp_mmx.c +++ b/libavcodec/x86/vc1dsp_mmx.c @@ -462,6 +462,15 @@ static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ \ /* Horizontal mode with no vertical mode */\ vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ +} \ +static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ + int stride, int hmode, int vmode, int rnd)\ +{ \ + OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ + OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ + dst += 8*stride; src += 8*stride; \ + OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ + OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ } VC1_MSPEL_MC(put_) @@ -482,6 +491,20 @@ static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \ int rnd) \ { \ avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ +}\ +static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, \ + int rnd) \ +{ \ + put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ +}\ +static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \ + const uint8_t *src,\ + ptrdiff_t stride, \ + int rnd) \ +{ \ + avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \ } DECLARE_FUNCTION(0, 1) @@ -710,50 +733,59 @@ static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, { ff_put_pixels8_mmx(dst, src, stride, 8); } +static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride, int rnd) +{ + ff_put_pixels16_mmx(dst, src, stride, 16); +} + +#define FN_ASSIGN(OP, X, Y, INSN) \ + dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \ + dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) { - dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx; - 
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; - dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; - dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; - dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; - dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; - dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; - dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; - dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; - dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; - dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; - dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; - dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; + FN_ASSIGN(put_, 0, 0, _mmx); + FN_ASSIGN(put_, 0, 1, _mmx); + FN_ASSIGN(put_, 0, 2, _mmx); + FN_ASSIGN(put_, 0, 3, _mmx); + + FN_ASSIGN(put_, 1, 0, _mmx); + FN_ASSIGN(put_, 1, 1, _mmx); + FN_ASSIGN(put_, 1, 2, _mmx); + FN_ASSIGN(put_, 1, 3, _mmx); + + FN_ASSIGN(put_, 2, 0, _mmx); + FN_ASSIGN(put_, 2, 1, _mmx); + FN_ASSIGN(put_, 2, 2, _mmx); + FN_ASSIGN(put_, 2, 3, _mmx); + + FN_ASSIGN(put_, 3, 0, _mmx); + FN_ASSIGN(put_, 3, 1, _mmx); + FN_ASSIGN(put_, 3, 2, _mmx); + FN_ASSIGN(put_, 3, 3, _mmx); } av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) { - dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; - dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; - dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 2] = 
avg_vc1_mspel_mc20_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; - dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; - dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; - - dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; - dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; - dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; - dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; + FN_ASSIGN(avg_, 0, 1, _mmxext); + FN_ASSIGN(avg_, 0, 2, _mmxext); + FN_ASSIGN(avg_, 0, 3, _mmxext); + + FN_ASSIGN(avg_, 1, 0, _mmxext); + FN_ASSIGN(avg_, 1, 1, _mmxext); + FN_ASSIGN(avg_, 1, 2, _mmxext); + FN_ASSIGN(avg_, 1, 3, _mmxext); + + FN_ASSIGN(avg_, 2, 0, _mmxext); + FN_ASSIGN(avg_, 2, 1, _mmxext); + FN_ASSIGN(avg_, 2, 2, _mmxext); + FN_ASSIGN(avg_, 2, 3, _mmxext); + + FN_ASSIGN(avg_, 3, 0, _mmxext); + FN_ASSIGN(avg_, 3, 1, _mmxext); + FN_ASSIGN(avg_, 3, 2, _mmxext); + FN_ASSIGN(avg_, 3, 3, _mmxext); dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; |