summaryrefslogtreecommitdiff
path: root/libavfilter/vf_gblur.c
diff options
context:
space:
mode:
author Ruiling Song <ruiling.song@intel.com> 2019-05-15 17:54:10 +0800
committer Ruiling Song <ruiling.song@intel.com> 2019-06-12 08:53:11 +0800
commit 83f9da77684e7ea0d8e9f9712ec716424140043a (patch)
tree 36264e4571080e5cdd0d9c9eb649d6c31a9c1cb7 /libavfilter/vf_gblur.c
parent 5fc8d87ba6954d3917a9095fb3b8c1d8caf0b0f4 (diff)
avfilter/vf_gblur: add x86 SIMD optimizations
The horizontal pass gets ~2x performance with the patch when running single-threaded. Overall performance was tested using the following commands (AVX2 enabled): ./ffmpeg -i 1080p.mp4 -vf gblur -f null /dev/null ./ffmpeg -i 1080p.mp4 -vf gblur=threads=1 -f null /dev/null For single thread, the fps improves from 43 to 60, about 40%. For multi-thread, the fps improves from 110 to 130, about 20%. Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Diffstat (limited to 'libavfilter/vf_gblur.c')
-rw-r--r-- libavfilter/vf_gblur.c 71
1 files changed, 32 insertions, 39 deletions
diff --git a/libavfilter/vf_gblur.c b/libavfilter/vf_gblur.c
index b91a8c074a..e71b33da80 100644
--- a/libavfilter/vf_gblur.c
+++ b/libavfilter/vf_gblur.c
@@ -30,30 +30,10 @@
#include "libavutil/pixdesc.h"
#include "avfilter.h"
#include "formats.h"
+#include "gblur.h"
#include "internal.h"
#include "video.h"
-typedef struct GBlurContext {
- const AVClass *class;
-
- float sigma;
- float sigmaV;
- int steps;
- int planes;
-
- int depth;
- int planewidth[4];
- int planeheight[4];
- float *buffer;
- float boundaryscale;
- float boundaryscaleV;
- float postscale;
- float postscaleV;
- float nu;
- float nuV;
- int nb_planes;
-} GBlurContext;
-
#define OFFSET(x) offsetof(GBlurContext, x)
#define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
@@ -72,39 +52,44 @@ typedef struct ThreadData {
int width;
} ThreadData;
-static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+static void horiz_slice_c(float *buffer, int width, int height, int steps,
+ float nu, float bscale)
{
- GBlurContext *s = ctx->priv;
- ThreadData *td = arg;
- const int height = td->height;
- const int width = td->width;
- const int slice_start = (height * jobnr ) / nb_jobs;
- const int slice_end = (height * (jobnr+1)) / nb_jobs;
- const float boundaryscale = s->boundaryscale;
- const int steps = s->steps;
- const float nu = s->nu;
- float *buffer = s->buffer;
- int y, x, step;
+ int step, x, y;
float *ptr;
-
- /* Filter horizontally along each row */
- for (y = slice_start; y < slice_end; y++) {
+ for (y = 0; y < height; y++) {
for (step = 0; step < steps; step++) {
ptr = buffer + width * y;
- ptr[0] *= boundaryscale;
+ ptr[0] *= bscale;
/* Filter rightwards */
for (x = 1; x < width; x++)
ptr[x] += nu * ptr[x - 1];
-
- ptr[x = width - 1] *= boundaryscale;
+ ptr[x = width - 1] *= bscale;
/* Filter leftwards */
for (; x > 0; x--)
ptr[x - 1] += nu * ptr[x];
}
}
+}
+static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+ GBlurContext *s = ctx->priv;
+ ThreadData *td = arg;
+ const int height = td->height;
+ const int width = td->width;
+ const int slice_start = (height * jobnr ) / nb_jobs;
+ const int slice_end = (height * (jobnr+1)) / nb_jobs;
+ const float boundaryscale = s->boundaryscale;
+ const int steps = s->steps;
+ const float nu = s->nu;
+ float *buffer = s->buffer;
+
+ s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start,
+ steps, nu, boundaryscale);
+ emms_c();
return 0;
}
@@ -231,6 +216,13 @@ static int query_formats(AVFilterContext *ctx)
return ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
}
+void ff_gblur_init(GBlurContext *s)
+{
+ s->horiz_slice = horiz_slice_c;
+ if (ARCH_X86_64)
+ ff_gblur_init_x86(s);
+}
+
static int config_input(AVFilterLink *inlink)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
@@ -251,6 +243,7 @@ static int config_input(AVFilterLink *inlink)
if (s->sigmaV < 0) {
s->sigmaV = s->sigma;
}
+ ff_gblur_init(s);
return 0;
}