From 83f9da77684e7ea0d8e9f9712ec716424140043a Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Wed, 15 May 2019 17:54:10 +0800 Subject: avfilter/vf_gblur: add x86 SIMD optimizations The horizontal pass gets ~2x performance with the patch under a single thread. Tested overall performance using the commands (avx2 enabled): ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null For single thread, the fps improves from 43 to 60, about 40%. For multi-thread, the fps improves from 110 to 130, about 20%. Signed-off-by: Ruiling Song --- libavfilter/vf_gblur.c | 71 +++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 39 deletions(-) (limited to 'libavfilter/vf_gblur.c') diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c index b91a8c074a..e71b33da80 100644 --- a/libavfilter/vf_gblur.c +++ b/libavfilter/vf_gblur.c @@ -30,30 +30,10 @@ #include "libavutil/pixdesc.h" #include "avfilter.h" #include "formats.h" +#include "gblur.h" #include "internal.h" #include "video.h" -typedef struct GBlurContext { - const AVClass *class; - - float sigma; - float sigmaV; - int steps; - int planes; - - int depth; - int planewidth[4]; - int planeheight[4]; - float *buffer; - float boundaryscale; - float boundaryscaleV; - float postscale; - float postscaleV; - float nu; - float nuV; - int nb_planes; -} GBlurContext; - #define OFFSET(x) offsetof(GBlurContext, x) #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM @@ -72,39 +52,44 @@ typedef struct ThreadData { int width; } ThreadData; -static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +static void horiz_slice_c(float *buffer, int width, int height, int steps, + float nu, float bscale) { - GBlurContext *s = ctx->priv; - ThreadData *td = arg; - const int height = td->height; - const int width = td->width; - const int slice_start = (height * jobnr ) / nb_jobs; - const int slice_end = (height * (jobnr+1)) / 
nb_jobs; - const float boundaryscale = s->boundaryscale; - const int steps = s->steps; - const float nu = s->nu; - float *buffer = s->buffer; - int y, x, step; + int step, x, y; float *ptr; - - /* Filter horizontally along each row */ - for (y = slice_start; y < slice_end; y++) { + for (y = 0; y < height; y++) { for (step = 0; step < steps; step++) { ptr = buffer + width * y; - ptr[0] *= boundaryscale; + ptr[0] *= bscale; /* Filter rightwards */ for (x = 1; x < width; x++) ptr[x] += nu * ptr[x - 1]; - - ptr[x = width - 1] *= boundaryscale; + ptr[x = width - 1] *= bscale; /* Filter leftwards */ for (; x > 0; x--) ptr[x - 1] += nu * ptr[x]; } } +} +static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) +{ + GBlurContext *s = ctx->priv; + ThreadData *td = arg; + const int height = td->height; + const int width = td->width; + const int slice_start = (height * jobnr ) / nb_jobs; + const int slice_end = (height * (jobnr+1)) / nb_jobs; + const float boundaryscale = s->boundaryscale; + const int steps = s->steps; + const float nu = s->nu; + float *buffer = s->buffer; + + s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start, + steps, nu, boundaryscale); + emms_c(); return 0; } @@ -231,6 +216,13 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, ff_make_format_list(pix_fmts)); } +void ff_gblur_init(GBlurContext *s) +{ + s->horiz_slice = horiz_slice_c; + if (ARCH_X86_64) + ff_gblur_init_x86(s); +} + static int config_input(AVFilterLink *inlink) { const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format); @@ -251,6 +243,7 @@ static int config_input(AVFilterLink *inlink) if (s->sigmaV < 0) { s->sigmaV = s->sigma; } + ff_gblur_init(s); return 0; } -- cgit v1.2.3