[FFmpeg-devel] [PATCH v4 1/2] avfilter: add audio overlay filter

Mon Jan 22 03:20:06 EET 2024

On date Tuesday 2024-01-16 17:46:42 +0530, Harshit Karwal wrote:
> Co-authored-by: Paul B Mahol <onemda at gmail.com>
> Signed-off-by: Harshit Karwal <karwalharshit at gmail.com>
> ---
>  doc/filters.texi          |  40 +++
>  libavfilter/Makefile      |   1 +
>  libavfilter/af_aoverlay.c | 538 ++++++++++++++++++++++++++++++++++++++
>  libavfilter/allfilters.c  |   1 +
>  4 files changed, 580 insertions(+)
>  create mode 100644 libavfilter/af_aoverlay.c
> 
> diff --git a/doc/filters.texi b/doc/filters.texi
> index 20c91bab3a..79eb600ae3 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -2779,6 +2779,46 @@ This filter supports the same commands as options, excluding option @code{order}
>  
>  Pass the audio source unchanged to the output.
>  
> +@section aoverlay
> +
> +Replace a specified section of an audio stream with another input audio stream.
> +

> +In case no enable option for timeline editing is specified, the second audio stream will

nit: use @option{enable} markup for the option name here

> +be output at sections of the first stream which have a gap in PTS (Presentation TimeStamp) values
> +such that the output stream's PTS values are monotonous.
> +
> +This filter also supports linear cross fading when transitioning from one
> +input stream to another.
> +

> +The filter accepts the following option:

nit: use "options" (plural), in case we add more later

> +

> +@table @option
> +@item cf_duration
> +Set duration (in seconds) for cross fade between the inputs. Default value is @code{100} milliseconds.
> +@end table
> +
> +@subsection Examples
> +
> +@itemize
> +@item
> +Replace the first stream with the second stream from @code{t=10} seconds to @code{t=20} seconds:
> +@example
> +ffmpeg -i first.wav -i second.wav -filter_complex "aoverlay=enable='between(t,10,20)'" output.wav
> +@end example
> +
> +@item
> +Do the same as above, but with crossfading for @code{2} seconds between the streams:
> +@example
> +ffmpeg -i first.wav -i second.wav -filter_complex "aoverlay=cf_duration=2:enable='between(t,10,20)'" output.wav
> +@end example
> +
> +@item
> +Introduce a PTS gap from @code{t=4} seconds to @code{t=8} seconds in the first stream and output the second stream during this gap:
> +@example
> +ffmpeg -i first.wav -i second.wav -filter_complex "[0]aselect='not(between(t,4,8))'[temp];[temp][1]aoverlay[out]" -map "[out]" output.wav
> +@end example
> +@end itemize
> +
>  @section apad
>  
>  Pad the end of an audio stream with silence.
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index bba0219876..0f2b403441 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -81,6 +81,7 @@ OBJS-$(CONFIG_ANLMDN_FILTER)                 += af_anlmdn.o
>  OBJS-$(CONFIG_ANLMF_FILTER)                  += af_anlms.o
>  OBJS-$(CONFIG_ANLMS_FILTER)                  += af_anlms.o
>  OBJS-$(CONFIG_ANULL_FILTER)                  += af_anull.o
> +OBJS-$(CONFIG_AOVERLAY_FILTER)               += af_aoverlay.o
>  OBJS-$(CONFIG_APAD_FILTER)                   += af_apad.o
>  OBJS-$(CONFIG_APERMS_FILTER)                 += f_perms.o
>  OBJS-$(CONFIG_APHASER_FILTER)                += af_aphaser.o generate_wave_table.o
> diff --git a/libavfilter/af_aoverlay.c b/libavfilter/af_aoverlay.c
> new file mode 100644
> index 0000000000..f7ac00dda1
> --- /dev/null
> +++ b/libavfilter/af_aoverlay.c
[...]
> +static int crossfade_prepare(AOverlayContext *s, AVFilterLink *main_inlink, AVFilterLink *overlay_inlink, AVFilterLink *outlink,
> +                             int nb_samples, AVFrame **main_buffer, AVFrame **overlay_buffer, int mode)
> +{
> +    int ret;
> +
> +    *main_buffer = ff_get_audio_buffer(outlink, nb_samples);
> +    if (!(*main_buffer))
> +        return AVERROR(ENOMEM);
> +
> +    (*main_buffer)->pts = s->pts;
> +    s->pts += av_rescale_q(nb_samples, (AVRational){ 1, outlink->sample_rate }, outlink->time_base);
> +
> +    if ((ret = av_audio_fifo_read(s->main_sample_buffers, (void **)(*main_buffer)->extended_data, nb_samples)) < 0)
> +        return ret;
> +

> +    if (mode == 1) {
> +        s->previous_samples = (*main_buffer)->nb_samples;
> +    } else if (mode == -1 || (mode == 0 && s->is_disabled)) {

It would help to use an enum to describe the possible mode values.

It would also help to introduce some debug log messages to aid
troubleshooting/debugging.

For instance, it would be very useful to show the exact time when the
overlay stream is inserted.

[...]
> +static int activate(AVFilterContext *ctx)
> +{
> +    AOverlayContext *s = ctx->priv;
> +    int status, ret, nb_samples;
> +    int64_t pts;
> +    AVFrame *out = NULL, *main_buffer = NULL, *overlay_buffer = NULL;
> +
> +    AVFilterLink *main_inlink = ctx->inputs[0];
> +    AVFilterLink *overlay_inlink = ctx->inputs[1];
> +    AVFilterLink *outlink = ctx->outputs[0];
> +
> +    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, ctx);
> +
> +    if (s->default_mode && (s->pts_gap_end - s->pts_gap_start <= 0 || s->overlay_eof)) {
> +        s->default_mode = 0;
> +        s->transition_pts2 = s->pts_gap_end;
> +    }
> +
> +    if (av_audio_fifo_space(s->main_sample_buffers) != 0 && !s->main_eof && !s->default_mode) {
> +        nb_samples = FFMIN(SEGMENT_SIZE, av_audio_fifo_space(s->main_sample_buffers));
> +
> +        ret = ff_inlink_consume_samples(main_inlink, nb_samples, nb_samples, &s->main_input);
> +        if (ret > 0) {
> +            if (ctx->enable_str && s->is_disabled != ctx->is_disabled && !s->overlay_eof) {
> +                s->is_disabled = ctx->is_disabled;
> +                s->transition_pts = s->main_input->pts;
> +
> +                if (s->main_input->nb_samples < av_audio_fifo_space(s->main_sample_buffers))
> +                    s->crossfade_ready = 1;
> +                if (av_audio_fifo_size(s->main_sample_buffers) == 0) {
> +                    s->transition_pts = AV_NOPTS_VALUE;
> +                    s->crossfade_ready = 0;
> +                }

> +            }
> +            if (!ctx->enable_str && !s->default_mode) {

nit: use "else if" here, to avoid evaluating this condition when the first block is executed

[...]
> +
> +static int config_output(AVFilterLink *outlink)
> +{
> +    AVFilterContext *ctx = outlink->src;
> +    AOverlayContext *s = ctx->priv;
> +    int size, fifo_size;
> +
> +    switch (outlink->format) {
> +    case AV_SAMPLE_FMT_DBLP: s->crossfade_samples = crossfade_samples_dblp;
> +                             size = sizeof(double);
> +                             break;
> +    case AV_SAMPLE_FMT_FLTP: s->crossfade_samples = crossfade_samples_fltp;
> +                             size = sizeof(float);
> +                             break;
> +    case AV_SAMPLE_FMT_S16P: s->crossfade_samples = crossfade_samples_s16p;
> +                             size = sizeof(int16_t);
> +                             break;
> +    case AV_SAMPLE_FMT_S32P: s->crossfade_samples = crossfade_samples_s32p;
> +                             size = sizeof(int32_t);
> +                             break;
> +    }
> +

> +    if (s->cf_duration)
> +        s->cf_samples = av_rescale(s->cf_duration, outlink->sample_rate, AV_TIME_BASE);

> +    else
> +        s->cf_samples = av_rescale(100000, outlink->sample_rate, AV_TIME_BASE);

Is this needed? Shouldn't the duration also be set for the default
case?

[...]

Thanks