[FFmpeg-devel] [PATCH] flite audio source
Nicolas George
nicolas.george at normalesup.org
Wed Jul 25 10:56:15 CEST 2012
L'octidi 8 thermidor, an CCXX, Stefano Sabatini a écrit :
> >From 8633817e2e9495719b116b3ba08e26fcb74036c8 Mon Sep 17 00:00:00 2001
> From: Stefano Sabatini <stefasab at gmail.com>
> Date: Sun, 21 Aug 2011 02:29:33 +0200
> Subject: [PATCH] lavfi: add flite audio source
>
> ---
> configure | 5 +
> doc/filters.texi | 56 ++++++++++
> libavfilter/Makefile | 1 +
> libavfilter/allfilters.c | 1 +
> libavfilter/asrc_flite.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 313 insertions(+), 0 deletions(-)
> create mode 100644 libavfilter/asrc_flite.c
>
> diff --git a/configure b/configure
> index 7363f9a..519cd63 100755
> --- a/configure
> +++ b/configure
> @@ -180,6 +180,7 @@ External library support:
> and libraw1394 [no]
> --enable-libfaac enable AAC encoding via libfaac [no]
> --enable-libfdk-aac enable AAC encoding via libfdk-aac [no]
> + --enable-libflite enable flite (voice synthesis) support via libflite [no]
> --enable-libfreetype enable libfreetype [no]
> --enable-libgsm enable GSM de/encoding via libgsm [no]
> --enable-libiec61883 enable iec61883 via libiec61883 [no]
> @@ -1073,6 +1074,7 @@ CONFIG_LIST="
> libdc1394
> libfaac
> libfdk_aac
> + libflite
> libfreetype
> libgsm
> libiec61883
> @@ -1784,6 +1786,7 @@ cropdetect_filter_deps="gpl"
> delogo_filter_deps="gpl"
> deshake_filter_deps="avcodec"
> drawtext_filter_deps="libfreetype"
> +flite_filter_deps="libflite"
> frei0r_filter_deps="frei0r dlopen"
> frei0r_filter_extralibs='$ldl'
> frei0r_src_filter_deps="frei0r dlopen"
> @@ -3340,6 +3343,8 @@ enabled libcelt && require libcelt celt/celt.h celt_decode -lcelt0 &&
> enabled libcaca && require_pkg_config caca caca.h caca_create_canvas
> enabled libfaac && require2 libfaac "stdint.h faac.h" faacEncGetVersion -lfaac
> enabled libfdk_aac && require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac
> +flite_libs="-lflite_cmu_time_awb -lflite_cmu_us_awb -lflite_cmu_us_kal -lflite_cmu_us_kal16 -lflite_cmu_us_rms -lflite_cmu_us_slt -lflite_usenglish -lflite_cmulex -lflite"
> +enabled libflite && require2 libflite "flite/flite.h" flite_init $flite_libs
> enabled libfreetype && require_pkg_config freetype2 "ft2build.h freetype/freetype.h" FT_Init_FreeType
> enabled libgsm && require libgsm gsm/gsm.h gsm_create -lgsm
> enabled libilbc && require libilbc ilbc.h WebRtcIlbcfix_InitDecode -lilbc
> diff --git a/doc/filters.texi b/doc/filters.texi
> index d328f39..6596421 100644
> --- a/doc/filters.texi
> +++ b/doc/filters.texi
> @@ -1031,6 +1031,62 @@ Channel layout of the audio data, in the form that can be accepted by
>
> All the parameters need to be explicitly defined.
>
> + at section flite
> +
> +Synthesize a voice utterance using the libflite library.
> +
> +To enable compilation of this filter you need to configure FFmpeg with
> + at code{--enable-libflite}.
> +
> +The source accepts parameters as a list of @var{key}=@var{value} pairs,
> +separated by ":".
> +
> +The description of the accepted parameters follows.
> +
> + at table @option
> +
> + at item textfile
> +Set the filename containing the text to speech.
> +
> + at item text
> +Set the text to speech.
A native English speaker should confirm, but I believe this should be "speak"
rather than "speech": "speech" is a noun and is very rarely used as a verb.
> +
> + at item voice, v
> +Set the voice to use for the speech synthesis. Currently recognized
> +voices are: @code{awb}, @code{kal}, @code{kal16}, @code{rms},
> + at code{slt}. Default value is @code{kal}.
> +
> + at item nb_samples, n
> +Set the maximum number of samples per frame. Default value is 512.
> + at end table
> +
> + at section Examples
> +
> + at itemize
> + at item
> +Read from file @file{speech.txt}, and synthetize the text using the
> +standard flite voice:
> + at example
> +flite=textfile=speech.txt
> + at end example
> +
> + at item
> +Read the specified text selecting the @code{slt} voice:
> + at example
> +flite=text='So fare thee well, poor devil of a Sub-Sub, whose commentator I am':voice=slt
> + at end example
> +
> + at item
> +Make @file{ffplay} speech the specified text, using @code{flite} and
> +the @code{lavfi} device:
> + at example
> +ffplay -f lavfi flite='No more be grieved for which that thou hast done.'
> + at end example
> + at end itemize
> +
> +For more information about libflite, check:
> + at url{http://www.speech.cs.cmu.edu/flite/}
> +
> @c man end AUDIO SOURCES
>
> @chapter Audio Sinks
> diff --git a/libavfilter/Makefile b/libavfilter/Makefile
> index 15ca959..5de1627 100644
> --- a/libavfilter/Makefile
> +++ b/libavfilter/Makefile
> @@ -71,6 +71,7 @@ OBJS-$(CONFIG_VOLUME_FILTER) += af_volume.o
> OBJS-$(CONFIG_AEVALSRC_FILTER) += asrc_aevalsrc.o
> OBJS-$(CONFIG_AMOVIE_FILTER) += src_movie.o
> OBJS-$(CONFIG_ANULLSRC_FILTER) += asrc_anullsrc.o
> +OBJS-$(CONFIG_FLITE_FILTER) += asrc_flite.o
>
> OBJS-$(CONFIG_ABUFFERSINK_FILTER) += sink_buffer.o
> OBJS-$(CONFIG_ANULLSINK_FILTER) += asink_anullsink.o
> diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
> index 58a3c14..2811d41 100644
> --- a/libavfilter/allfilters.c
> +++ b/libavfilter/allfilters.c
> @@ -60,6 +60,7 @@ void avfilter_register_all(void)
> REGISTER_FILTER (AEVALSRC, aevalsrc, asrc);
> REGISTER_FILTER (AMOVIE, amovie, asrc);
> REGISTER_FILTER (ANULLSRC, anullsrc, asrc);
> + REGISTER_FILTER (FLITE, flite, asrc);
>
> REGISTER_FILTER (ABUFFERSINK, abuffersink, asink);
> REGISTER_FILTER (ANULLSINK, anullsink, asink);
> diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
> new file mode 100644
> index 0000000..3f7b75e
> --- /dev/null
> +++ b/libavfilter/asrc_flite.c
> @@ -0,0 +1,250 @@
> +/*
> + * Copyright (c) 2012 Stefano Sabatini
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * flite voice synth source
> + */
> +
> +#include <flite/flite.h>
> +#include "libavutil/audioconvert.h"
> +#include "libavutil/file.h"
> +#include "libavutil/opt.h"
> +#include "avfilter.h"
> +#include "audio.h"
> +#include "formats.h"
> +#include "internal.h"
> +
> +typedef struct {
> + const AVClass *class;
> + char *voice_str;
> + char *textfile;
> + char *text;
> + cst_wave *wave;
> + int16_t *wave_samples;
> + int wave_nb_samples;
> + cst_voice *voice;
> + int64_t pts;
> + int frame_nb_samples; ///< number of samples per frame
> +} FliteContext;
> +
> +#define OFFSET(x) offsetof(FliteContext, x)
> +
> +static const AVOption flite_options[] = {
> + { "textfile", "set filename of the text to speech", OFFSET(textfile), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
> + { "text", "set text to speech", OFFSET(text), AV_OPT_TYPE_STRING, {.str=NULL}, CHAR_MIN, CHAR_MAX },
> + { "voice", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
> + { "v", "set voice", OFFSET(voice_str), AV_OPT_TYPE_STRING, {.str="kal"}, CHAR_MIN, CHAR_MAX },
> + { "nb_samples", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
> + { "n", "set number of samples per frame", OFFSET(frame_nb_samples), AV_OPT_TYPE_INT, {.dbl=512}, 0, INT_MAX },
> + { NULL }
> +};
> +
> +AVFILTER_DEFINE_CLASS(flite);
> +
> +static int flite_inited = 0;
Maybe declare it "volatile", to be slightly safer with respect to threads.
> +
> +/* declare functions for all the supported voices */
> +#define DECLARE_REGISTER_VOICE_FN(name) cst_voice *register_cmu_us_## name(void *)
> +
> +DECLARE_REGISTER_VOICE_FN(awb);
> +DECLARE_REGISTER_VOICE_FN(kal);
> +DECLARE_REGISTER_VOICE_FN(kal16);
> +DECLARE_REGISTER_VOICE_FN(rms);
> +DECLARE_REGISTER_VOICE_FN(slt);
> +
> +struct voice_entry {
> + const char *name;
> + cst_voice * (*register_fn)(void *);
> +} voice_entry;
> +
> +static cst_voice *select_voice(const char *voice_name)
> +{
> + int i;
> + static struct voice_entry voice_entries[] = {
> + { "awb", register_cmu_us_awb },
> + { "kal", register_cmu_us_kal },
> + { "kal16", register_cmu_us_kal16 },
> + { "rms", register_cmu_us_rms },
> + { "slt", register_cmu_us_slt },
> + };
I wonder whether a smart use of macros would make it possible to avoid
duplicating the list of voices. But do not consider it a blocker.
> +
> + for (i = 0; i < FF_ARRAY_ELEMS(voice_entries); i++) {
> + struct voice_entry *voice = &voice_entries[i];
> + if (!strcmp(voice->name, voice_name))
> + return voice->register_fn(NULL);
> + }
> + return NULL;
> +}
> +
> +static av_cold int init(AVFilterContext *ctx, const char *args)
> +{
> + FliteContext *flite = ctx->priv;
> + int err = 0;
> +
> + flite->class = &flite_class;
> + av_opt_set_defaults(flite);
> +
> + if ((err = av_set_options_string(flite, args, "=", ":")) < 0) {
> + av_log(ctx, AV_LOG_ERROR, "Error parsing options string: '%s'\n", args);
> + return err;
> + }
> +
> + if (!flite_inited) {
> + if (flite_init() < 0) {
> + av_log(ctx, AV_LOG_ERROR, "flite initialization failed\n");
> + return AVERROR_UNKNOWN;
> + }
> + flite_inited++;
> + }
> +
> + flite->voice = select_voice(flite->voice_str);
> + if (!flite->voice) {
> + av_log(ctx, AV_LOG_ERROR, "Impossible to select voice '%s'\n", flite->voice_str);
> + return AVERROR(EINVAL);
> + }
Can register_fn fail? If not, "Unknown voice" (plus maybe the list of known
voices) would be more accurate. If yes, distinguishing between unknown and
failure would be better.
> +
> + if (flite->textfile && flite->text) {
> + av_log(ctx, AV_LOG_ERROR,
> + "Both text and textfile options set. Only one must be specified\n");
Having a terminated sentence and then an unterminated one seems really
strange. Maybe use a semicolon in the middle.
> + return AVERROR(EINVAL);
> + }
> +
> + if (flite->textfile) {
> + uint8_t *textbuf;
> + size_t textbuf_size;
> +
> + if (flite->text) {
> + }
This empty "if" block looks like a leftover from something.
> + if ((err = av_file_map(flite->textfile, &textbuf, &textbuf_size, 0, ctx)) < 0) {
> + av_log(ctx, AV_LOG_ERROR,
> + "The text file '%s' could not be read\n", flite->textfile);
"... could not be read: %s", ..., av_err2str(err)? The same message will
_probably_ be displayed later, but the information belongs here, and there is
a message anyway.
> + return err;
> + }
> +
> + if (!(flite->text = av_malloc(textbuf_size+1)))
> + return AVERROR(ENOMEM);
> + memcpy(flite->text, textbuf, textbuf_size);
> + flite->text[textbuf_size] = 0;
> + av_file_unmap(textbuf, textbuf_size);
> + }
> +
> + if (!flite->text) {
> + av_log(ctx, AV_LOG_ERROR, "No speech text specified, specify the 'text' or 'textfile' option\n");
> + return AVERROR(EINVAL);
> + }
> +
> + /* synth all the file data in block */
> + flite->wave = flite_text_to_wave(flite->text, flite->voice);
> + flite->wave_samples = flite->wave->samples;
> + flite->wave_nb_samples = flite->wave->num_samples;
> + return 0;
> +}
> +
> +static av_cold void uninit(AVFilterContext *ctx)
> +{
> + FliteContext *flite = ctx->priv;
> +
> + av_opt_free(flite);
> +
> + delete_voice(flite->voice);
> + flite->voice = NULL;
> + delete_wave(flite->wave);
> + flite->wave = NULL;
> +}
> +
> +static int config_props(AVFilterLink *outlink)
> +{
> + AVFilterContext *ctx = outlink->src;
> + FliteContext *flite = ctx->priv;
> +
> + outlink->sample_rate = flite->wave->sample_rate;
> + outlink->time_base = (AVRational){1, flite->wave->sample_rate};
> +
> + av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
> + flite->voice_str,
> + av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
> + return 0;
> +}
> +
> +static int query_formats(AVFilterContext *ctx)
I believe query_formats comes, logically, before config_props.
> +{
> + FliteContext *flite = ctx->priv;
> +
> + AVFilterChannelLayouts *chlayouts = NULL;
> + int64_t chlayout = av_get_default_channel_layout(flite->wave->num_channels);
> + AVFilterFormats *sample_formats = NULL;
> + AVFilterFormats *sample_rates = NULL;
> +
> + ff_add_channel_layout(&chlayouts, chlayout);
> + ff_set_common_channel_layouts(ctx, chlayouts);
> + ff_add_format(&sample_formats, AV_SAMPLE_FMT_S16);
> + ff_set_common_formats(ctx, sample_formats);
> + ff_add_format(&sample_rates, flite->wave->sample_rate);
> + ff_set_common_samplerates (ctx, sample_rates);
> +
> + return 0;
> +}
> +
> +static int request_frame(AVFilterLink *outlink)
> +{
> + AVFilterBufferRef *samplesref;
> + FliteContext *flite = outlink->src->priv;
> + int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples);
> +
> + if (!nb_samples)
> + return AVERROR_EOF;
> +
> + samplesref = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
> + if (!samplesref)
> + return AVERROR(ENOMEM);
> +
> + memcpy(samplesref->data[0], flite->wave_samples,
> + nb_samples * flite->wave->num_channels * 2);
> + samplesref->pts = flite->pts;
> + samplesref->pos = -1;
> + samplesref->audio->sample_rate = flite->wave->sample_rate;
> + flite->pts += nb_samples;
> + flite->wave_samples += nb_samples * flite->wave->num_channels;
> + flite->wave_nb_samples -= nb_samples;
> +
> + return ff_filter_samples(outlink, samplesref);
> +}
> +
> +AVFilter avfilter_asrc_flite = {
> + .name = "flite",
> + .description = NULL_IF_CONFIG_SMALL("Synthesize voice from text using libflite."),
> + .query_formats = query_formats,
> + .init = init,
> + .uninit = uninit,
> + .priv_size = sizeof(FliteContext),
> +
> + .inputs = (const AVFilterPad[]) {{ .name = NULL}},
> +
> + .outputs = (const AVFilterPad[]) {
> + {
> + .name = "default",
> + .type = AVMEDIA_TYPE_AUDIO,
> + .config_props = config_props,
> + .request_frame = request_frame,
> + },
> + { .name = NULL }
> + },
> +};
I did not grade the level of nitpicking. Most of the comments are minor.
Thanks for the work.
Regards,
--
Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20120725/34992cb1/attachment.asc>
More information about the ffmpeg-devel
mailing list