From 716b39674059d5b416faef92afd41654a6d9469b Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Tue, 5 Oct 2021 20:58:30 -0700 Subject: avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I spotted an interesting pattern that I didn't see before that leads to the implementation being faster. The bit shifting table I was using before is no longer needed, and I was able to remove quite a few lines.  I also added use of FMA in the AVX2 version. f32 1920x1080 1 thread with prelut c impl 1434012700 UNITS in lut3d->interp,       1 runs,      0 skips 1434035335 UNITS in lut3d->interp,       2 runs,      0 skips 1423615347 UNITS in lut3d->interp,       4 runs,      0 skips 1426268863 UNITS in lut3d->interp,       8 runs,      0 skips sse2 905484420 UNITS in lut3d->interp,       1 runs,      0 skips 905659010 UNITS in lut3d->interp,       2 runs,      0 skips 915167140 UNITS in lut3d->interp,       4 runs,      0 skips 915834222 UNITS in lut3d->interp,       8 runs,      0 skips avx 574794860 UNITS in lut3d->interp,       1 runs,      0 skips 581035090 UNITS in lut3d->interp,       2 runs,      0 skips 584116720 UNITS in lut3d->interp,       4 runs,      0 skips 581460290 UNITS in lut3d->interp,       8 runs,      0 skips avx2 301698880 UNITS in lut3d->interp,       1 runs,      0 skips 301982880 UNITS in lut3d->interp,       2 runs,      0 skips 306962430 UNITS in lut3d->interp,       4 runs,      0 skips 305472025 UNITS in lut3d->interp,       8 runs,      0 skips gbrap16 1920x1080 1 thread with prelut c impl 1480894840 UNITS in lut3d->interp,       1 runs,      0 skips 1502922990 UNITS in lut3d->interp,       2 runs,      0 skips 1496114307 UNITS in lut3d->interp,       4 runs,      0 skips 1492554551 UNITS in lut3d->interp,       8 runs,      0 skips sse2 980777180 UNITS in lut3d->interp,       1 runs,      0 skips 986121520 UNITS in lut3d->interp,       2 runs,      0 skips 
986489840 UNITS in lut3d->interp,       4 runs,      0 skips 998832248 UNITS in lut3d->interp,       8 runs,      0 skips avx 622212360 UNITS in lut3d->interp,       1 runs,      0 skips 622981160 UNITS in lut3d->interp,       2 runs,      0 skips 645396315 UNITS in lut3d->interp,       4 runs,      0 skips 641057075 UNITS in lut3d->interp,       8 runs,      0 skips avx2 321336400 UNITS in lut3d->interp,       1 runs,      0 skips 321268920 UNITS in lut3d->interp,       2 runs,      0 skips 323459895 UNITS in lut3d->interp,       4 runs,      0 skips 324949967 UNITS in lut3d->interp,       8 runs,      0 skips --- libavfilter/vf_lut3d.c | 61 +++++--------------------------------------------- 1 file changed, 5 insertions(+), 56 deletions(-) (limited to 'libavfilter/vf_lut3d.c') diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c index 8ec07f8ab0..7ef96906fc 100644 --- a/libavfilter/vf_lut3d.c +++ b/libavfilter/vf_lut3d.c @@ -31,73 +31,18 @@ #include "libavutil/intreadwrite.h" #include "libavutil/intfloat.h" #include "libavutil/avassert.h" -#include "libavutil/pixdesc.h" #include "libavutil/avstring.h" -#include "avfilter.h" #include "drawutils.h" #include "formats.h" -#include "framesync.h" #include "internal.h" #include "video.h" +#include "lut3d.h" #define R 0 #define G 1 #define B 2 #define A 3 -enum interp_mode { - INTERPOLATE_NEAREST, - INTERPOLATE_TRILINEAR, - INTERPOLATE_TETRAHEDRAL, - INTERPOLATE_PYRAMID, - INTERPOLATE_PRISM, - NB_INTERP_MODE -}; - -struct rgbvec { - float r, g, b; -}; - -/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT - * of 512x512 (64x64x64) */ -#define MAX_LEVEL 256 -#define PRELUT_SIZE 65536 - -typedef struct Lut3DPreLut { - int size; - float min[3]; - float max[3]; - float scale[3]; - float* lut[3]; -} Lut3DPreLut; - -typedef struct LUT3DContext { - const AVClass *class; - int interpolation; ///