[FFmpeg-devel] [PATCH] Whisper audio filter
Michael Niedermayer
michael at niedermayer.cc
Sat Jul 19 03:15:53 EEST 2025
Hi Vittorio
On Thu, Jul 17, 2025 at 10:51:57AM +0200, Vittorio Palmisano wrote:
> It adds a new audio filter for running audio transcriptions with the whisper model.
> Documentation and examples are included into the patch.
>
> Signed-off-by: Vittorio Palmisano <vpalmisano at gmail.com>
> ---
> configure | 5 +
> doc/filters.texi | 107 +++++++++
> libavfilter/Makefile | 2 +
> libavfilter/af_whisper.c | 452 +++++++++++++++++++++++++++++++++++++++
> libavfilter/allfilters.c | 2 +
> 5 files changed, 568 insertions(+)
> create mode 100644 libavfilter/af_whisper.c
[...]
> +static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
> +{
> + AVFilterContext *ctx = (AVFilterContext *) user_data;
> + switch (level) {
> + case GGML_LOG_LEVEL_ERROR:
> + av_log(ctx, AV_LOG_ERROR, "%s", text);
> + break;
> + case GGML_LOG_LEVEL_WARN:
> + av_log(ctx, AV_LOG_WARNING, "%s", text);
> + break;
> + case GGML_LOG_LEVEL_INFO:
> + case GGML_LOG_LEVEL_DEBUG:
> + av_log(ctx, AV_LOG_DEBUG, "%s", text);
> + break;
> + }
> +}
You can factor the av_log() calls out of the switch/case: let the switch only select the log level, then call av_log() once after it.
> +
> +static int init(AVFilterContext *ctx)
> +{
> + WhisperContext *wctx = ctx->priv;
> +
> + static AVOnce init_static_once = AV_ONCE_INIT;
> + ff_thread_once(&init_static_once, ggml_backend_load_all);
> +
> + whisper_log_set(cb_log, ctx);
> +
> + // Init whisper context
> + if (!wctx->model_path) {
> + av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
> + return AVERROR(EINVAL);
> + }
> +
> + struct whisper_context_params params = whisper_context_default_params();
> + params.use_gpu = wctx->use_gpu;
> + params.gpu_device = wctx->gpu_device;
> +
> + wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
> + if (wctx->ctx_wsp == NULL) {
> + av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
> + return AVERROR(EIO);
> + }
> +
> + // Init buffer
> + wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000000;
The multiplication can overflow, and the 32-bit output could overflow as well.
The best option is probably to limit queue to a more reasonable value than INT64_MAX.
> + wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(*wctx->audio_buffer));
Use av_calloc() or av_malloc_array() here instead of multiplying the size by hand.
[...]
> +static void run_transcription(AVFilterContext *ctx, AVDictionary **metadata, int end_pos)
> +{
> + WhisperContext *wctx = ctx->priv;
> + end_pos = FFMAX(0, FFMIN(end_pos, wctx->audio_buffer_fill_size));
> +
> + if (!wctx->ctx_wsp || end_pos == 0)
> + return;
> +
> + float duration = (float) end_pos / WHISPER_SAMPLE_RATE;
[...]
> + wctx->timestamp += duration * 1000;
Floats are not precise, and the accumulated rounding errors will
add up and lead to synchronization issues between the subtitles
and the audio or video over a long enough timespan.
Also, for reproducibility this should use integers.
What you could do is use:
wctx->timestamp += end_pos;
and then replace every use of wctx->timestamp with wctx->timestamp / WHISPER_SAMPLE_RATE,
or with wctx->timestamp / (double)WHISPER_SAMPLE_RATE if the context demands a
double, for example.
That way the code is exact and no errors accumulate.
> +
> + if (metadata && segments_text) {
> + av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
> + char *duration_text = av_asprintf("%f", duration);
> + av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
> + }
> + av_freep(&segments_text);
> +
> + memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos, end_pos * sizeof(*wctx->audio_buffer));
> + wctx->audio_buffer_fill_size -= end_pos;
> + wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> +}
> +
> +static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
> +{
> + AVFilterContext *ctx = inlink->dst;
> + WhisperContext *wctx = ctx->priv;
> + AVFilterLink *outlink = ctx->outputs[0];
> + AVDictionary **metadata = &frame->metadata;
> +
> + const int samples = frame->nb_samples;
> + const float *input_data = (const float *) frame->data[0];
> +
> + if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {
> + run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> + }
> +
> + memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
> + wctx->audio_buffer_fill_size += samples;
> +
> + if (wctx->ctx_vad
> + && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
> + WHISPER_SAMPLE_RATE * (wctx->vad_min_speech_duration + wctx->vad_min_silence_duration) / 1000000) {
> + struct whisper_vad_segments *segments = whisper_vad_segments_from_samples(wctx->ctx_vad,
> + wctx->vad_params,
> + wctx->audio_buffer,
> + wctx->audio_buffer_fill_size);
> + wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> +
> + if (!segments) {
> + av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
> + } else {
> + int n_segments = whisper_vad_segments_n_segments(segments);
> +
> + if (n_segments > 0) {
> + const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
> + const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
> + int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
> +
> + if (end_pos <= wctx->audio_buffer_fill_size - WHISPER_SAMPLE_RATE * wctx->vad_min_silence_duration / 1000000) {
> + av_log(ctx, AV_LOG_INFO,
> + "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
> + n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
> + run_transcription(ctx, metadata, end_pos);
> + }
> + }
> +
> + whisper_vad_free_segments(segments);
> + }
> + } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
> + run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> +
> + wctx->next_pts = frame->pts + av_rescale_q(frame->nb_samples, (AVRational) {
> + 1, inlink->sample_rate}
> + , inlink->time_base);
I think you should consistently use either samples or frame->nb_samples; as far as I can
tell they hold the same value.
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Never trust a computer, one day, it may think you are the virus. -- Compn
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20250719/e23b40c7/attachment.sig>
More information about the ffmpeg-devel
mailing list