author     Christophe Gisquet <christophe.gisquet@gmail.com>  2014-04-11 21:02:08 +0200
committer  Michael Niedermayer <michaelni@gmx.at>             2014-04-20 18:25:36 +0200
commit     319235c67c59d6abaa78d5af57121ab9816f937d (patch)
tree       4025d69a59c3e70b8cad858d486983b747671236 /libavcodec/x86/vc1dsp_init.c
parent     de9cd5884822375d492ff4dcc98e55317a66c196 (diff)
vc1dsp: introduce cases for 8x8 and 16x16
This allows further unrolling of the DSP implementation where possible. The x86 and ARM DSP code is modified by simply moving the multiple calls from vc1dec into the DSP code. Decoding improvements should only occur because the compiler is actually able to unroll more.

Decoding time: ~8.80s -> ~8.64s (i.e. around 2%)

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
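For illustration, a minimal C sketch of the idea behind the split (not the commit's code): the mspel function table gains a size dimension, so a 16x16 block is handled by a single DSP call whose loops the compiler can unroll, rather than by four 8x8 calls issued from vc1dec. The layout with row 0 holding 16x16 functions and row 1 holding 8x8 ones matches the assignments in the diff below; the C fallbacks and the 16 subpel columns are assumptions of this sketch.

#include <stddef.h>
#include <stdint.h>

typedef void (*vc1_mspel_mc_func)(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd);

/* 8x8 no-subpel averaging copy (the "mc00" case); rnd is unused here */
static void avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride, int rnd)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++)
            dst[x] = (dst[x] + src[x] + 1) >> 1; /* rounding average */
        dst += stride;
        src += stride;
    }
}

/* 16x16 variant: the whole block is one call, so the loop lives inside
 * the DSP function, where the compiler is free to unroll it */
static void avg_vc1_mspel_mc00_16_c(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int rnd)
{
    for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++)
            dst[x] = (dst[x] + src[x] + 1) >> 1;
        dst += stride;
        src += stride;
    }
}

/* row 0: 16x16 functions, row 1: 8x8; column: subpel position */
static vc1_mspel_mc_func avg_vc1_mspel_pixels_tab[2][16] = {
    { avg_vc1_mspel_mc00_16_c },
    { avg_vc1_mspel_mc00_c    },
};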
Diffstat (limited to 'libavcodec/x86/vc1dsp_init.c')
-rw-r--r--  libavcodec/x86/vc1dsp_init.c  22
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 9256be3816..d81c28451e 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -64,11 +64,28 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
 }
 
+static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels8_mmx(dst, src, stride, 8);
+}
 static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int rnd)
 {
     ff_avg_pixels8_mmxext(dst, src, stride, 8);
 }
+
+static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
+                                      ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_mmx(dst, src, stride, 16);
+}
+static void avg_vc1_mspel_mc00_16_sse2(uint8_t *dst, const uint8_t *src,
+                                       ptrdiff_t stride, int rnd)
+{
+    ff_avg_pixels16_sse2(dst, src, stride, 16);
+}
+
 #endif /* HAVE_YASM */
 
 void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
@@ -104,6 +121,8 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 #if HAVE_YASM
     if (EXTERNAL_MMX(cpu_flags)) {
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_mmx;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_mmx;
     }
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
@@ -112,13 +131,14 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
-        dsp->avg_vc1_mspel_pixels_tab[0]         = avg_vc1_mspel_mc00_mmxext;
+        dsp->avg_vc1_mspel_pixels_tab[1][0]      = avg_vc1_mspel_mc00_mmxext;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
         dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
         dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
+        dsp->avg_vc1_mspel_pixels_tab[0][0]      = avg_vc1_mspel_mc00_16_sse2;
     }
     if (EXTERNAL_SSSE3(cpu_flags)) {
         ASSIGN_LF(ssse3);
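A hedged sketch of how a call site would use the new indexing, assuming the VC1DSPContext from libavcodec/vc1dsp.h after this change; the helper name mc_avg and the parameters is_16x16 and dxy are hypothetical, not from the commit:

/* Pick the table row by block size (0 = 16x16, 1 = 8x8) and the
 * column by the subpel position dxy, then average into dst. */
static void mc_avg(const VC1DSPContext *dsp, uint8_t *dst, const uint8_t *src,
                   ptrdiff_t stride, int rnd, int dxy, int is_16x16)
{
    dsp->avg_vc1_mspel_pixels_tab[is_16x16 ? 0 : 1][dxy](dst, src, stride, rnd);
}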