From 2b5166addf9956f0617e6007bc02387cde9927dd Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Thu, 25 May 2023 23:06:50 +0200
Subject: avfilter/af_silenceremove: add real peak detector

Rename the old peak detector to the more accurate name, avg.
---
 doc/filters.texi                     |  3 +-
 libavfilter/af_silenceremove.c       | 42 ++++++++++++++++-----
 libavfilter/silenceremove_template.c | 71 ++++++++++++++++++++++++++++++++----
 tests/fate/filter-audio.mak          |  2 +-
 4 files changed, 98 insertions(+), 20 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index 47b26fe92f..6f15b54d10 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -6461,8 +6461,7 @@ With @var{all}, only if all channels are detected as non-silence will cause
 stopped trimming of silence.
 
 @item detection
-Set how is silence detected. Can be @code{rms} or @code{peak}. Second is faster
-and works better with digital silence which is exactly 0.
+Set how silence is detected. Can be @code{avg}, @code{rms} or @code{peak}.
 Default value is @code{rms}.
 
 @item window
diff --git a/libavfilter/af_silenceremove.c b/libavfilter/af_silenceremove.c
index e0592c2368..28c972f86f 100644
--- a/libavfilter/af_silenceremove.c
+++ b/libavfilter/af_silenceremove.c
@@ -33,8 +33,10 @@
 #include "internal.h"
 
 enum SilenceDetect {
-    D_PEAK,
+    D_AVG,
     D_RMS,
+    D_PEAK,
+    D_NB
 };
 
 enum ThresholdMode {
@@ -75,6 +77,12 @@ typedef struct SilenceRemoveContext {
     AVFrame *start_window;
     AVFrame *stop_window;
 
+    int *start_front;
+    int *start_back;
+
+    int *stop_front;
+    int *stop_back;
+
     int64_t window_duration;
 
     int start_window_pos;
@@ -100,8 +108,8 @@ typedef struct SilenceRemoveContext {
 
     int detection;
 
-    float (*compute_flt)(float *c, float s, float ws, int size);
-    double (*compute_dbl)(double *c, double s, double ws, int size);
+    float (*compute_flt)(float *c, float s, float ws, int size, int *front, int *back);
+    double (*compute_dbl)(double *c, double s, double ws, int size, int *front, int *back);
 } SilenceRemoveContext;
 
 #define OFFSET(x) offsetof(SilenceRemoveContext, x)
@@ -120,9 +128,10 @@ static const AVOption silenceremove_options[] = {
     { "stop_threshold",  "set threshold for stop silence detection",           OFFSET(stop_threshold),      AV_OPT_TYPE_DOUBLE,   {.dbl=0},     0,   DBL_MAX, AF },
     { "stop_silence",    "set stop duration of silence part to keep",          OFFSET(stop_silence_opt),    AV_OPT_TYPE_DURATION, {.i64=0},     0, INT32_MAX, AF },
     { "stop_mode",       "set which channel will trigger trimming from end",   OFFSET(stop_mode),           AV_OPT_TYPE_INT,      {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" },
-    { "detection",       "set how silence is detected",                        OFFSET(detection),           AV_OPT_TYPE_INT,      {.i64=D_RMS}, D_PEAK,D_RMS, AF, "detection" },
-    {   "peak",          "use absolute values of samples",                     0,                           AV_OPT_TYPE_CONST,    {.i64=D_PEAK},0,         0, AF, "detection" },
-    {   "rms",           "use squared values of samples",                      0,                           AV_OPT_TYPE_CONST,    {.i64=D_RMS}, 0,         0, AF, "detection" },
+    { "detection",       "set how silence is detected",                        OFFSET(detection),           AV_OPT_TYPE_INT,      {.i64=D_RMS}, 0,    D_NB-1, AF, "detection" },
+    {   "avg",           "use mean absolute values of samples",                0,                           AV_OPT_TYPE_CONST,    {.i64=D_AVG}, 0,         0, AF, "detection" },
+    {   "rms",           "use root mean squared values of samples",            0,                           AV_OPT_TYPE_CONST,    {.i64=D_RMS}, 0,         0, AF, "detection" },
+    {   "peak",          "use max absolute values of samples",                 0,                           AV_OPT_TYPE_CONST,    {.i64=D_PEAK},0,         0, AF, "detection" },
     { "window",          "set duration of window for silence detection",       OFFSET(window_duration_opt), AV_OPT_TYPE_DURATION, {.i64=20000}, 0, 100000000, AF },
     { NULL }
 };
@@ -201,7 +210,9 @@ static int config_output(AVFilterLink *outlink)
 
     s->start_window = ff_get_audio_buffer(outlink, s->window_duration);
     s->stop_window = ff_get_audio_buffer(outlink, s->window_duration);
-    if (!s->start_window || !s->stop_window)
+    s->start_cache = av_calloc(outlink->ch_layout.nb_channels, s->window_duration * sizeof(*s->start_cache));
+    s->stop_cache = av_calloc(outlink->ch_layout.nb_channels, s->window_duration * sizeof(*s->stop_cache));
+    if (!s->start_window || !s->stop_window || !s->start_cache || !s->stop_cache)
         return AVERROR(ENOMEM);
 
     s->start_queuef = ff_get_audio_buffer(outlink, s->start_silence + 1);
@@ -209,14 +220,20 @@ static int config_output(AVFilterLink *outlink)
     if (!s->start_queuef || !s->stop_queuef)
         return AVERROR(ENOMEM);
 
-    s->start_cache = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->start_cache));
-    s->stop_cache = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->stop_cache));
-    if (!s->start_cache || !s->stop_cache)
+    s->start_front = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->start_front));
+    s->start_back = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->start_back));
+    s->stop_front = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->stop_front));
+    s->stop_back = av_calloc(outlink->ch_layout.nb_channels, sizeof(*s->stop_back));
+    if (!s->start_front || !s->start_back || !s->stop_front || !s->stop_back)
         return AVERROR(ENOMEM);
 
     clear_windows(s);
 
     switch (s->detection) {
+    case D_AVG:
+        s->compute_flt = compute_avg_flt;
+        s->compute_dbl = compute_avg_dbl;
+        break;
     case D_PEAK:
         s->compute_flt = compute_peak_flt;
         s->compute_dbl = compute_peak_dbl;
@@ -374,8 +391,13 @@ static av_cold void uninit(AVFilterContext *ctx)
     av_frame_free(&s->stop_window);
     av_frame_free(&s->start_queuef);
     av_frame_free(&s->stop_queuef);
+
     av_freep(&s->start_cache);
     av_freep(&s->stop_cache);
+    av_freep(&s->start_front);
+    av_freep(&s->start_back);
+    av_freep(&s->stop_front);
+    av_freep(&s->stop_back);
 }
 
 static const AVFilterPad silenceremove_inputs[] = {
diff --git a/libavfilter/silenceremove_template.c b/libavfilter/silenceremove_template.c
index 1a12435ee6..ef63ea1e7e 100644
--- a/libavfilter/silenceremove_template.c
+++ b/libavfilter/silenceremove_template.c
@@ -99,8 +99,8 @@ static void fn(queue_sample)(AVFilterContext *ctx,
         *window_pos = 0;
 }
 
-static ftype fn(compute_peak)(ftype *cache, ftype sample, ftype wsample,
-                              int window_size)
+static ftype fn(compute_avg)(ftype *cache, ftype sample, ftype wsample,
+                             int window_size, int *unused, int *unused2)
 {
     ftype r;
 
@@ -111,8 +111,49 @@ static ftype fn(compute_peak)(ftype *cache, ftype sample, ftype wsample,
     return r / window_size;
 }
 
+static ftype fn(compute_peak)(ftype *peak, ftype sample, ftype wsample,
+                              int size, int *ffront, int *bback)
+{
+    ftype r, abs_sample = FABS(sample); /* FABS is defined outside this view; presumably fabs/fabsf */
+    int front = *ffront; /* caller-owned index into peak[], written back before returning */
+    int back = *bback;   /* caller-owned index into peak[], written back before returning */
+    /* Update the circular peak[] buffer (size entries) with |sample| and return peak[front]. */
+    if (front != back && abs_sample > peak[front]) {
+        while (front != back) { /* walk front down, wrapping, until it meets back */
+            front--;
+            if (front < 0)
+                front = size - 1;
+        }
+    }
+    /* Advance back (wrapping) past stored values smaller than the incoming magnitude. */
+    while (front != back && abs_sample > peak[back]) {
+        back++;
+        if (back >= size)
+            back = 0;
+    }
+    /* wsample is the window slot the callers overwrite next; if it matches peak[front], step front down. */
+    if (front != back && FABS(wsample) == peak[front]) {
+        front--;
+        if (front < 0)
+            front = size - 1;
+    }
+    /* Step back down (wrapping) and store the incoming magnitude in that slot. */
+    back--;
+    if (back < 0)
+        back = size - 1;
+    av_assert2(back != front); /* the slot being written must not be the front slot */
+    peak[back] = abs_sample;
+    /* The value currently at front is the detector output for this call. */
+    r = peak[front];
+    /* Persist the updated indices for the next call. */
+    *ffront = front;
+    *bback = back;
+
+    return r;
+}
+
 static ftype fn(compute_rms)(ftype *cache, ftype sample, ftype wsample,
-                             int window_size)
+                             int window_size, int *unused, int *unused2)
 {
     ftype r;
 
@@ -143,6 +184,9 @@ static void fn(filter_start)(AVFilterContext *ctx,
     const int start_duration = s->start_duration;
     ftype *start_cache = (ftype *)s->start_cache;
     const int start_silence = s->start_silence;
+    int window_size = start_window_nb_samples;
+    int *front = s->start_front;
+    int *back = s->start_back;
 
     fn(queue_sample)(ctx, src, start,
                      &s->start_queue_pos,
@@ -153,15 +197,20 @@ static void fn(filter_start)(AVFilterContext *ctx,
                      start_nb_samples,
                      start_window_nb_samples);
 
+    if (s->detection != D_PEAK)
+        window_size = s->start_window_size;
+
     for (int ch = 0; ch < nb_channels; ch++) {
         ftype start_sample = start[start_pos + ch];
         ftype start_ow = startw[start_wpos + ch];
         ftype tstart;
 
-        tstart = fn(s->compute)(start_cache + ch,
+        tstart = fn(s->compute)(start_cache + ch * start_window_nb_samples,
                                 start_sample,
                                 start_ow,
-                                s->start_window_size);
+                                window_size,
+                                front + ch,
+                                back + ch);
 
         startw[start_wpos + ch] = start_sample;
 
@@ -226,6 +275,9 @@ static void fn(filter_stop)(AVFilterContext *ctx,
     ftype *stop_cache = (ftype *)s->stop_cache;
     const int stop_silence = s->stop_silence;
     const int restart = s->restart;
+    int window_size = stop_window_nb_samples;
+    int *front = s->stop_front;
+    int *back = s->stop_back;
 
     fn(queue_sample)(ctx, src, stop,
                      &s->stop_queue_pos,
@@ -236,15 +288,20 @@ static void fn(filter_stop)(AVFilterContext *ctx,
                      stop_nb_samples,
                      stop_window_nb_samples);
 
+    if (s->detection != D_PEAK)
+        window_size = s->stop_window_size;
+
     for (int ch = 0; ch < nb_channels; ch++) {
         ftype stop_sample = stop[stop_pos + ch];
         ftype stop_ow = stopw[stop_wpos + ch];
         ftype tstop;
 
-        tstop = fn(s->compute)(stop_cache + ch,
+        tstop = fn(s->compute)(stop_cache + ch * stop_window_nb_samples,
                                stop_sample,
                                stop_ow,
-                               s->stop_window_size);
+                               window_size,
+                               front + ch,
+                               back + ch);
 
         stopw[stop_wpos + ch] = stop_sample;
 
diff --git a/tests/fate/filter-audio.mak b/tests/fate/filter-audio.mak
index eff32b9f81..445c0f9217 100644
--- a/tests/fate/filter-audio.mak
+++ b/tests/fate/filter-audio.mak
@@ -184,7 +184,7 @@ fate-filter-pan-downmix2: SRC = $(TARGET_PATH)/tests/data/asynth-44100-11.wav
 fate-filter-pan-downmix2: CMD = framecrc -ss 3.14 -i $(SRC) -frames:a 20 -filter:a "pan=5C|c0=0.7*c0+0.7*c10|c1=c9|c2=c8|c3=c7|c4=c6"
 
 FATE_AFILTER-$(call ALLYES, LAVFI_INDEV, AEVALSRC_FILTER SILENCEREMOVE_FILTER) += fate-filter-silenceremove
-fate-filter-silenceremove: CMD = framecrc -auto_conversion_filters -f lavfi -i "aevalsrc=between(t\,1\,2)+between(t\,4\,5)+between(t\,7\,9):d=10:n=8192,silenceremove=start_periods=0:start_duration=0:start_threshold=0:stop_periods=-1:stop_duration=0:stop_threshold=-90dB:window=0:detection=peak"
+fate-filter-silenceremove: CMD = framecrc -auto_conversion_filters -f lavfi -i "aevalsrc=between(t\,1\,2)+between(t\,4\,5)+between(t\,7\,9):d=10:n=8192,silenceremove=start_periods=0:start_duration=0:start_threshold=0:stop_periods=-1:stop_duration=0:stop_threshold=-90dB:window=0:detection=avg"
 
 FATE_AFILTER_SAMPLES-$(call FILTERDEMDECENCMUX, STEREOTOOLS, WAV, PCM_S16LE, PCM_S16LE, WAV) += fate-filter-stereotools
 fate-filter-stereotools: SRC = $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
-- 
cgit v1.2.3