From 43d16aef6395340c248ee79d35f60f9965427a45 Mon Sep 17 00:00:00 2001 From: Clément Bœsch Date: Sun, 6 May 2018 12:34:54 +0200 Subject: lavfi/nlmeans: make compute_safe_ssd_integral_image_c faster before: ssd_integral_image_c: 49204.6 after: ssd_integral_image_c: 44272.8 Unrolling by 4 made the biggest difference on odroid-c2 (aarch64); unrolling by 2 or 8 both raised 46k cycles vs 44k for 4. Additionally, this is a much better reference when writing SIMD (SIMD vectorization will just target 16 instead of 4). --- libavfilter/vf_nlmeans.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'libavfilter/vf_nlmeans.c') diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c index c30e44498f..f37f1183f7 100644 --- a/libavfilter/vf_nlmeans.c +++ b/libavfilter/vf_nlmeans.c @@ -146,10 +146,6 @@ static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, int * function, we do not need any clipping here. * * The line above dst and the column to its left are always readable. - * - * This C version computes the SSD integral image using a scalar accumulator, - * while for SIMD implementation it is likely more interesting to use the - * two-loops algorithm variant. */ static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t dst_linesize_32, const uint8_t *s1, ptrdiff_t linesize1, @@ -157,21 +153,32 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, ptrdiff_t dst_lines int w, int h) { int x, y; + const uint32_t *dst_top = dst - dst_linesize_32; /* SIMD-friendly assumptions allowed here */ av_assert2(!(w & 0xf) && w >= 16 && h >= 1); for (y = 0; y < h; y++) { - uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1]; - - for (x = 0; x < w; x++) { - const int d = s1[x] - s2[x]; - acc += d * d; - dst[x] = dst[-dst_linesize_32 + x] + acc; + for (x = 0; x < w; x += 4) { + const int d0 = s1[x ] - s2[x ]; + const int d1 = s1[x + 1] - s2[x + 1]; + const int d2 = s1[x + 2] - s2[x + 2]; + const int d3 = s1[x + 3] - s2[x + 3]; + + dst[x ] = dst_top[x ] - dst_top[x - 1] + d0*d0; + dst[x + 1] = dst_top[x + 1] - dst_top[x ] + d1*d1; + dst[x + 2] = dst_top[x + 2] - dst_top[x + 1] + d2*d2; + dst[x + 3] = dst_top[x + 3] - dst_top[x + 2] + d3*d3; + + dst[x ] += dst[x - 1]; + dst[x + 1] += dst[x ]; + dst[x + 2] += dst[x + 1]; + dst[x + 3] += dst[x + 2]; } s1 += linesize1; s2 += linesize2; dst += dst_linesize_32; + dst_top += dst_linesize_32; } } -- cgit v1.2.3