From bff7242608409dc52bf2fd51a67bb9d5f171a0ab Mon Sep 17 00:00:00 2001 From: James Darnley Date: Tue, 14 Jul 2015 23:48:47 +0000 Subject: avfilter/vf_removegrain: add x86 and x86_64 SSE2 functions Speed of all modes increased by a factor between 7.4 and 19.8, largely depending on whether bytes are unpacked into words. Modes 2, 3, and 4 have been sped up by a factor of 43 (thanks quick sort!) All modes are available on x86_64 but only modes 1, 10, 11, 12, 13, 14, 19, 20, 21, and 22 are available on x86 due to the number of SIMD registers used. With a contribution from James Almer --- libavfilter/vf_removegrain.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'libavfilter/vf_removegrain.c') diff --git a/libavfilter/vf_removegrain.c b/libavfilter/vf_removegrain.c index 77b35617cc..da17f6a5ad 100644 --- a/libavfilter/vf_removegrain.c +++ b/libavfilter/vf_removegrain.c @@ -2,6 +2,7 @@ * Copyright (c) 2012 Laurent de Soras * Copyright (c) 2013 Fredrik Mellbin * Copyright (c) 2015 Paul B Mahol + * Copyright (c) 2015 James Darnley * * This file is part of FFmpeg. 
* @@ -20,32 +21,15 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* - * TODO: add SIMD - */ - #include "libavutil/imgutils.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" #include "avfilter.h" #include "formats.h" #include "internal.h" +#include "removegrain.h" #include "video.h" -typedef struct RemoveGrainContext { - const AVClass *class; - - int mode[4]; - - int nb_planes; - int planewidth[4]; - int planeheight[4]; - int skip_even; - int skip_odd; - - int (*rg[4])(int c, int a1, int a2, int a3, int a4, int a5, int a6, int a7, int a8); -} RemoveGrainContext; - #define OFFSET(x) offsetof(RemoveGrainContext, x) #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM @@ -142,6 +126,7 @@ static int mode05(int c, int a1, int a2, int a3, int a4, int a5, int a6, int a7, const int mindiff = FFMIN(FFMIN(c1, c2), FFMIN(c3, c4)); + /* When adding SIMD notice the return order here: 4, 2, 3, 1. */ if (mindiff == c4) { return av_clip(c, mi4, ma4); } else if (mindiff == c2) { @@ -524,6 +509,9 @@ static int config_input(AVFilterLink *inlink) } } + if (ARCH_X86) + ff_removegrain_init_x86(s); + return 0; } @@ -566,7 +554,19 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) } *dst++ = *src++; - for (x = 1; x < s->planewidth[i] - 1; x++) { + + if (s->fl[i]) { + int w_asm = (s->planewidth[i] - 2) & ~15; + + s->fl[i](dst, src, in->linesize[i], w_asm); + + x = 1 + w_asm; + dst += w_asm; + src += w_asm; + } else + x = 1; + + for (; x < s->planewidth[i] - 1; x++) { const int a1 = src[-op]; const int a2 = src[-o0]; const int a3 = src[-om]; -- cgit v1.2.3