[FFmpeg-devel] [PATCH 2/2] lavc: support subtitles charset conversion.
Nicolas George
nicolas.george at normalesup.org
Sat Feb 16 11:00:18 CET 2013
Le septidi 27 pluviôse, an CCXXI, Clement Boesch a écrit :
> Rebased on master. The two branches are now updated on my github. Also
> attached the two patches from the -nofilter one.
Thanks.
> From eca1faebbf3e21bf4925ae26f4ceb9eef4afb744 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <ubitux at gmail.com>
> Date: Sat, 5 Jan 2013 11:06:31 +0100
> Subject: [PATCH 1/2] lavc: mark bitmap based subtitles codecs as such.
I am still not sure this is useful, but it looks correct. Wait for
confirmation about the avcodec.h part though.
> From 1eb424d688127c37b19765549b612977e760655c Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <ubitux at gmail.com>
> Date: Mon, 7 Jan 2013 18:08:56 +0100
> Subject: [PATCH 2/2] lavc: support subtitles character encoding conversion.
>
> TODO: bump lavc micro
> ---
> Changelog | 1 +
> configure | 2 +
> libavcodec/avcodec.h | 18 ++++++++
> libavcodec/options_table.h | 1 +
> libavcodec/utils.c | 108 +++++++++++++++++++++++++++++++++++++++++++--
> 5 files changed, 127 insertions(+), 3 deletions(-)
>
> diff --git a/Changelog b/Changelog
> index 4a88e5a..24aeeea 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -18,6 +18,7 @@ version <next>:
> - il filter ported from libmpcodecs
> - support ID3v2 tags in ASF files
> - RF64 support in WAV muxer
> +- Subtitles character re-encoding
"character encoding conversion"?
>
>
> version 1.1:
> diff --git a/configure b/configure
> index 94aff58..1f6172e 100755
> --- a/configure
> +++ b/configure
> @@ -1390,6 +1390,7 @@ HAVE_LIST="
> gnu_as
> gsm_h
> ibm_asm
> + iconv
> inet_aton
> io_h
> isatty
> @@ -3715,6 +3716,7 @@ check_func getopt
> check_func getrusage
> check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
> check_func gettimeofday
> +check_func iconv
> check_func inet_aton $network_extralibs
> check_func isatty
> check_func localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index fc7091c..4721087 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3208,6 +3208,24 @@ typedef struct AVCodecContext {
> * - encoding: unused
> */
> AVDictionary *metadata;
> +
> + /**
> + * Character encoding of the input subtitles file.
> + * - decoding: set by user
> + * - encoding: unused
> + */
Missing remark about not accessing the field directly.
> + char *sub_charenc;
> +
> + /**
> + * Subtitles character encoding mode.
> + * - decoding: set by libavcodec, not intended to be used by user apps
I am really not sure about that: this field is intended, amongst other things,
for lavf if it does the conversion itself. From lavc's point of view, lavf
is a user app.
> + * - encoding: unused
> + */
> + int sub_charenc_mode;
> +#define FF_SUB_CHARENC_MODE_DO_NOTHING -1 ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
> +#define FF_SUB_CHARENC_MODE_AUTOMATIC 0 ///< libavcodec will select the mode itself
> +#define FF_SUB_CHARENC_MODE_DECODER_PRE 1 ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
> +//#define FF_SUB_CHARENC_MODE_DECODER_POST 2 ///< the AVSubitle data needs to be recoded to UTF-8 after the decoder pass, requires iconv
?
> } AVCodecContext;
>
> AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 33cb4b2..f27f5f0 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -406,6 +406,7 @@ static const AVOption options[]={
> {"ka", "Karaoke", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE }, INT_MIN, INT_MAX, A|E, "audio_service_type"},
> {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
> {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
> {NULL},
> };
>
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 2493798..13bc9ad 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -48,6 +48,9 @@
> #include <stdarg.h>
> #include <limits.h>
> #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>
> volatile int ff_avcodec_locked;
> static int volatile entangled_thread_counter = 0;
> @@ -1068,6 +1071,34 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
> ret = AVERROR(EINVAL);
> goto free_and_end;
> }
> + if (avctx->sub_charenc) {
> + if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
> + av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
> + "supported with subtitles codecs\n");
> + ret = AVERROR(EINVAL);
> + goto free_and_end;
> + } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
> + av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
> + "subtitles character encoding will be ignored\n",
> + avctx->codec_descriptor->name);
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> + } else {
> + /* input character encoding is set for a text based subtitle
> + * codec at this point */
> + if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DECODER_PRE;
> +
> + if (!HAVE_ICONV && avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_DECODER_PRE) {
> + av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
> + "conversion needs a libavcodec built with iconv support "
> + "for this codec\n");
> + ret = AVERROR(ENOSYS);
> + goto free_and_end;
> + }
> + }
> + } else {
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> + }
> }
> end:
> ff_unlock_avcodec();
> @@ -1826,6 +1857,68 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
> return ret;
> }
>
> +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
> +static int recode_subtitle(AVCodecContext *avctx,
> + AVPacket *outpkt, const AVPacket *inpkt)
> +{
> +#if HAVE_ICONV
> + iconv_t cd = (iconv_t)-1;
> + int ret = 0;
> + char *inb, *outb;
> + size_t inl, outl;
> + AVPacket tmp;
> +#endif
> +
> + if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_DECODER_PRE)
> + return 0;
> +
> +#if HAVE_ICONV
> + cd = iconv_open("UTF-8", avctx->sub_charenc);
> + if (cd == (iconv_t)-1) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> + "with input character encoding \"%s\"\n", avctx->sub_charenc);
> + ret = AVERROR(errno);
> + goto end;
> + }
> +
> + inb = inpkt->data;
> + inl = inpkt->size;
> +
> + if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
> + av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
> + ret = AVERROR(ENOMEM);
> + goto end;
> + }
> +
> + ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
> + if (ret < 0)
> + goto end;
> + outpkt->data = tmp.data;
> + outpkt->size = tmp.size;
> + outb = outpkt->data;
> + outl = outpkt->size;
> +
> + if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> + iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
> + outl >= outpkt->size || inl != 0) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> + "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
> + av_free_packet(&tmp);
> + ret = AVERROR(errno);
> + goto end;
> + }
> + outpkt->size -= outl;
> + outpkt->data[outpkt->size - 1] = '\0';
> +
> +end:
> + if (cd != (iconv_t)-1)
> + iconv_close(cd);
> + return ret;
> +#else
> + av_assert0(!"requesting subtitles recoding without iconv");
> +#endif
> +}
> +
> int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> int *got_sub_ptr,
> AVPacket *avpkt)
> @@ -1841,19 +1934,28 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> avcodec_get_subtitle_defaults(sub);
>
> if (avpkt->size) {
> + AVPacket pkt_recoded;
> AVPacket tmp = *avpkt;
> int did_split = av_packet_split_side_data(&tmp);
> //apply_param_change(avctx, &tmp);
>
> - avctx->pkt = &tmp;
> + pkt_recoded = tmp;
> + ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
> + if (ret < 0) {
> + *got_sub_ptr = 0;
> + } else {
> + avctx->pkt = &pkt_recoded;
>
> if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
> sub->pts = av_rescale_q(avpkt->pts,
> avctx->pkt_timebase, AV_TIME_BASE_Q);
> - ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
> + ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
> + if (tmp.data != pkt_recoded.data)
> + av_free(pkt_recoded.data);
> sub->format = !(avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB);
> -
> avctx->pkt = NULL;
> + }
> +
> if (did_split) {
> ff_packet_free_side_data(&tmp);
> if(ret == tmp.size)
The rest LGTM now, thanks for your efforts.
Once the iconv part is pushed, I will try to start working on reading text
files with various encodings.
Regards,
--
Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130216/7e21da72/attachment.asc>
More information about the ffmpeg-devel mailing list