[FFmpeg-cvslog] avfilter/af_speechnorm: implement rms option

Mon Nov 7 09:32:29 EET 2022

ffmpeg | branch: master | Paul B Mahol <onemda at gmail.com> | Sun Nov  6 14:03:27 2022 +0100| [7027101904a483e1c6a6db89b47b3362c37155a0] | committer: Paul B Mahol

avfilter/af_speechnorm: implement rms option

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7027101904a483e1c6a6db89b47b3362c37155a0
---

 doc/filters.texi            |  4 ++++
 libavfilter/af_speechnorm.c | 33 +++++++++++++++++++++++++++------
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index b50582eb2a..18b14fc376 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -6338,6 +6338,10 @@ option. When enabled any half-cycle of samples with their local peak value below
 Link channels when calculating gain applied to each filtered channel sample, by default is disabled.
 When disabled each filtered channel gain calculation is independent, otherwise when this option
 is enabled the minimum of all possible gains for each filtered channel is used.
+
+ at item rms, m
+Set the expansion target RMS value. This specifies the highest allowed RMS level for the normalized
+audio input. Default value is 0.0, thus disabled. Allowed range is from 0.0 to 1.0.
 @end table
 
 @subsection Commands
diff --git a/libavfilter/af_speechnorm.c b/libavfilter/af_speechnorm.c
index c9bd8d5cac..fd6b7d9a32 100644
--- a/libavfilter/af_speechnorm.c
+++ b/libavfilter/af_speechnorm.c
@@ -46,6 +46,7 @@ typedef struct PeriodItem {
     int size;
     int type;
     double max_peak;
+    double rms_sum;
 } PeriodItem;
 
 typedef struct ChannelContext {
@@ -54,6 +55,7 @@ typedef struct ChannelContext {
     PeriodItem pi[MAX_ITEMS];
     double gain_state;
     double pi_max_peak;
+    double pi_rms_sum;
     int pi_start;
     int pi_end;
     int pi_size;
@@ -62,6 +64,7 @@ typedef struct ChannelContext {
 typedef struct SpeechNormalizerContext {
     const AVClass *class;
 
+    double rms_value;
     double peak_value;
     double max_expansion;
     double max_compression;
@@ -110,6 +113,8 @@ static const AVOption speechnorm_options[] = {
     { "i",      "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
     { "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
     { "l",    "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
+    { "rms", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
+    { "m",   "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
     { NULL }
 };
 
@@ -161,12 +166,16 @@ static void consume_pi(ChannelContext *cc, int nb_samples)
     }
 }
 
-static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state)
+static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state,
+                        double pi_rms_sum, int pi_size)
 {
     SpeechNormalizerContext *s = ctx->priv;
-    const double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
     const double compression = 1. / s->max_compression;
     const int type = s->invert ? pi_max_peak <= s->threshold_value : pi_max_peak >= s->threshold_value;
+    double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
+
+    if (s->rms_value > DBL_EPSILON)
+        expansion = FFMIN(expansion, s->rms_value / sqrt(pi_rms_sum / pi_size));
 
     if (bypass) {
         return 1.;
@@ -187,13 +196,15 @@ static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass)
         av_assert1(cc->pi[start].size > 0);
         av_assert0(cc->pi[start].type > 0 || s->eof);
         cc->pi_size = cc->pi[start].size;
+        cc->pi_rms_sum = cc->pi[start].rms_sum;
         cc->pi_max_peak = cc->pi[start].max_peak;
         av_assert1(cc->pi_start != cc->pi_end || s->eof);
         start++;
         if (start >= MAX_ITEMS)
             start = 0;
         cc->pi_start = start;
-        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state);
+        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state,
+                                   cc->pi_rms_sum, cc->pi_size);
     }
 }
 
@@ -209,7 +220,8 @@ static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size)
     while (size <= max_size) {
         if (idx == cc->pi_end)
             break;
-        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state);
+        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state,
+                               cc->pi[idx].rms_sum, cc->pi[idx].size);
         min_gain = FFMIN(min_gain, gain_state);
         size += cc->pi[idx].size;
         idx++;
@@ -236,11 +248,13 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
                                                                                 \
     while (n < nb_samples) {                                                    \
         ptype new_max_peak;                                                     \
+        ptype new_rms_sum;                                                      \
         int new_size;                                                           \
                                                                                 \
         if ((cc->state != (src[n] >= zero)) ||                                  \
             (pi[pi_end].size > max_period)) {                                   \
             ptype max_peak = pi[pi_end].max_peak;                               \
+            ptype rms_sum = pi[pi_end].rms_sum;                                 \
             int state = cc->state;                                              \
                                                                                 \
             cc->state = src[n] >= zero;                                         \
@@ -251,10 +265,13 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
                 pi_end++;                                                       \
                 if (pi_end >= MAX_ITEMS)                                        \
                     pi_end = 0;                                                 \
-                if (cc->state != state)                                         \
+                if (cc->state != state) {                                       \
                     pi[pi_end].max_peak = DBL_MIN;                              \
-                else                                                            \
+                    pi[pi_end].rms_sum = 0.0;                                   \
+                } else {                                                        \
                     pi[pi_end].max_peak = max_peak;                             \
+                    pi[pi_end].rms_sum = rms_sum;                               \
+                }                                                               \
                 pi[pi_end].type = 0;                                            \
                 pi[pi_end].size = 0;                                            \
                 av_assert1(pi_end != cc->pi_start);                             \
@@ -262,10 +279,12 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         }                                                                       \
                                                                                 \
         new_max_peak = pi[pi_end].max_peak;                                     \
+        new_rms_sum = pi[pi_end].rms_sum;                                       \
         new_size = pi[pi_end].size;                                             \
         if (cc->state) {                                                        \
             while (src[n] >= zero) {                                            \
                 new_max_peak = FFMAX(new_max_peak,  src[n]);                    \
+                new_rms_sum += src[n] * src[n];                                 \
                 new_size++;                                                     \
                 n++;                                                            \
                 if (n >= nb_samples)                                            \
@@ -274,6 +293,7 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         } else {                                                                \
             while (src[n] < zero) {                                             \
                 new_max_peak = FFMAX(new_max_peak, -src[n]);                    \
+                new_rms_sum += src[n] * src[n];                                 \
                 new_size++;                                                     \
                 n++;                                                            \
                 if (n >= nb_samples)                                            \
@@ -282,6 +302,7 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         }                                                                       \
                                                                                 \
         pi[pi_end].max_peak = new_max_peak;                                     \
+        pi[pi_end].rms_sum = new_rms_sum;                                       \
         pi[pi_end].size = new_size;                                             \
     }                                                                           \
     cc->pi_end = pi_end;                                                        \