[FFmpeg-devel] [PATCH 2/2] lavc: support subtitles charset conversion.
Nicolas George
nicolas.george at normalesup.org
Tue Jan 8 12:29:36 CET 2013
Thanks for the new version. Only technical points now:
> I don't understand what you mean here: -sub_charenc is used to specify the
> character encoding of the input, so using -sub_charenc copy would mean
For now, sub_charenc is only for input, but at some point I expect someone
will address the character encoding (and LF/CRLF status) of output subtitles
files too. That is not very important though.
L'octidi 18 nivôse, an CCXXI, Clement Boesch a écrit :
> TODO: bump lavc micro
> ---
> Changelog | 2 +
> configure | 2 +
> libavcodec/avcodec.h | 18 ++++++++
> libavcodec/options_table.h | 1 +
> libavcodec/utils.c | 108 +++++++++++++++++++++++++++++++++++++++++--
> libavformat/assdec.c | 1 +
> libavformat/tedcaptionsdec.c | 1 +
> libavformat/webvttdec.c | 1 +
> 8 files changed, 131 insertions(+), 3 deletions(-)
>
> diff --git a/Changelog b/Changelog
> index 1a2e96a..42a38c8 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -3,6 +3,8 @@ releases are sorted from youngest to oldest.
>
> version <next>:
>
> +- Subtitles character re-encoding
> +
>
> version 1.1:
>
> diff --git a/configure b/configure
> index 89e076f..1b119e8 100755
> --- a/configure
> +++ b/configure
> @@ -1380,6 +1380,7 @@ HAVE_LIST="
> glob
> gnu_as
> ibm_asm
> + iconv
> inet_aton
> io_h
> isatty
> @@ -3698,6 +3699,7 @@ check_func getopt
> check_func getrusage
> check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
> check_func gettimeofday
> +check_func iconv
> check_func inet_aton $network_extralibs
> check_func isatty
> check_func localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index 62f5474..be62fc8 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3195,6 +3195,24 @@ typedef struct AVCodecContext {
> * - encoding: unused
> */
> AVDictionary *metadata;
> +
> + /**
> + * Character encoding of the input subtitles file.
> + * - decoding: set by user, libavformat or libavcodec
> + * - encoding: unused
> + */
> + char *sub_charenc;
Needs a not about not accessing it directly from outside lavc (and possibly
an accessor).
> +
> + /**
> + * Subtitles character encoding mode.
> + * - decoding: set by libavformat or libavcodec, not intended to be used by user apps
Do we consider lavf mandatory for subtitles demuxing? IMHO, from lavc's
point of view, lavf is a "user app".
> + * - encoding: unused
> + */
> + int sub_charenc_mode;
> +#define FF_SUB_CHARENC_MODE_DO_NOTHING -1 ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
> +#define FF_SUB_CHARENC_MODE_AUTOMATIC 0 ///< libavcodec will select the mode itself
> +#define FF_SUB_CHARENC_MODE_DECODER_PRE 2 ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
> +//#define FF_SUB_CHARENC_MODE_DECODER_POST 3 ///< the AVSubitle data needs to be recoded to UTF-8 after the decoder pass, requires iconv
Nit: enum?
> } AVCodecContext;
>
> AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 2a31fa6..75c1961 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -403,6 +403,7 @@ static const AVOption options[]={
> {"ka", "Karaoke", 0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE }, INT_MIN, INT_MAX, A|E, "audio_service_type"},
> {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
> {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
> {NULL},
> };
>
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 99b2202..3e2326e 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -47,6 +47,9 @@
> #include <stdarg.h>
> #include <limits.h>
> #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>
> volatile int ff_avcodec_locked;
> static int volatile entangled_thread_counter = 0;
> @@ -1058,6 +1061,34 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
> ret = AVERROR(EINVAL);
> goto free_and_end;
> }
> + if (avctx->sub_charenc) {
> + if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
> + av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
> + "supported with subtitles codecs\n");
> + ret = AVERROR(EINVAL);
> + goto free_and_end;
> + } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
> + av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
> + "subtitles character encoding will be ignored\n",
> + avctx->codec_descriptor->name);
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> + } else {
> + /* input character encoding is set for a text based subtitle
> + * codec at this point */
> + if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DECODER_PRE;
> +
> + if (!HAVE_ICONV && avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_DECODER_PRE) {
> + av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
> + "conversion needs a libavcodec built with iconv support "
> + "for this codec\n");
> + ret = AVERROR(ENOSYS);
> + goto free_and_end;
> + }
> + }
> + } else {
> + avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> + }
> }
> end:
> ff_unlock_avcodec();
> @@ -1816,6 +1847,68 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
> return ret;
> }
>
> +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
> +static int recode_subtitle(AVCodecContext *avctx,
> + AVPacket *outpkt, const AVPacket *inpkt)
> +{
> +#if HAVE_ICONV
> + iconv_t cd = (iconv_t)-1;
> + int ret = 0;
> + char *inb, *outb;
> + size_t inl, outl;
> + AVPacket tmp;
> +#endif
> +
> + if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_DECODER_PRE)
> + return 0;
> +
> +#if HAVE_ICONV
> + cd = iconv_open("UTF-8", avctx->sub_charenc);
> + if (cd == (iconv_t)-1) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> + "with input character encoding \"%s\"\n", avctx->sub_charenc);
> + ret = AVERROR(errno);
> + goto end;
> + }
> +
> + inb = inpkt->data;
> + inl = inpkt->size;
> +
> + if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
> + av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
> + ret = AVERROR(ENOMEM);
> + goto end;
> + }
> +
> + ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
> + if (ret < 0)
> + goto end;
> + outpkt->data = tmp.data;
> + outpkt->size = tmp.size;
> + outb = outpkt->data;
> + outl = outpkt->size;
> +
> + if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> + iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
> + outl >= outpkt->size || inl != 0) {
> + av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> + "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
> + av_free_packet(&tmp);
> + ret = AVERROR(errno);
> + goto end;
> + }
> + outpkt->size -= outl;
> + outpkt->data[outpkt->size - 1] = '\0';
> +
> +end:
> + if (cd != (iconv_t)-1)
> + iconv_close(cd);
> + return ret;
> +#else
> + av_assert0(!"requesting subtitles recoding without iconv");
> +#endif
> +}
> +
> int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> int *got_sub_ptr,
> AVPacket *avpkt)
> @@ -1831,19 +1924,28 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> avcodec_get_subtitle_defaults(sub);
>
> if (avpkt->size) {
> + AVPacket pkt_recoded;
> AVPacket tmp = *avpkt;
> int did_split = av_packet_split_side_data(&tmp);
> //apply_param_change(avctx, &tmp);
>
> - avctx->pkt = &tmp;
> + pkt_recoded = tmp;
> + ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
> + if (ret < 0) {
> + *got_sub_ptr = 0;
> + } else {
> + avctx->pkt = &pkt_recoded;
>
> if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
> sub->pts = av_rescale_q(avpkt->pts,
> avctx->pkt_timebase, AV_TIME_BASE_Q);
> - ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
> + ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
> + if (tmp.data != pkt_recoded.data)
> + av_free(pkt_recoded.data);
> sub->format = !(avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB);
> -
> avctx->pkt = NULL;
> + }
> +
> if (did_split) {
> ff_packet_free_side_data(&tmp);
> if(ret == tmp.size)
> diff --git a/libavformat/assdec.c b/libavformat/assdec.c
> index 35fcb51..710a171 100644
> --- a/libavformat/assdec.c
> +++ b/libavformat/assdec.c
> @@ -93,6 +93,7 @@ static int ass_read_header(AVFormatContext *s)
> avpriv_set_pts_info(st, 64, 1, 100);
> st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
> st->codec->codec_id= AV_CODEC_ID_SSA;
> + st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
lavf should not access the final AVCodecContext fields directly. Also,
usually, FF_* macros are for intra-use only (but there are exceptions).
Another problem: at this point, sub_charenc is still NULL, and therefore
DO_NOTHING is redundant. If this code path leads to sub_charenc being
non-NULL, that means it has been set by the user: ignoring it seems wrong.
IMHO, the demuxer should only set the mode to DO_NOTHING if it actually does
something with the encoding.
(ASS files in something else than UTF-8 exist; I am sure malformed webftt
files exist too somewhere.)
>
> header_remaining= INT_MAX;
>
> diff --git a/libavformat/tedcaptionsdec.c b/libavformat/tedcaptionsdec.c
> index 85bed0a..69d0d17 100644
> --- a/libavformat/tedcaptionsdec.c
> +++ b/libavformat/tedcaptionsdec.c
> @@ -296,6 +296,7 @@ static av_cold int tedcaptions_read_header(AVFormatContext *avf)
> return AVERROR(ENOMEM);
> st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
> st->codec->codec_id = CODEC_ID_TEXT;
> + st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> avpriv_set_pts_info(st, 64, 1, 1000);
> st->probe_packets = 0;
> st->start_time = 0;
> diff --git a/libavformat/webvttdec.c b/libavformat/webvttdec.c
> index 694a020..42d8162 100644
> --- a/libavformat/webvttdec.c
> +++ b/libavformat/webvttdec.c
> @@ -66,6 +66,7 @@ static int webvtt_read_header(AVFormatContext *s)
> avpriv_set_pts_info(st, 64, 1, 1000);
> st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
> st->codec->codec_id = AV_CODEC_ID_WEBVTT;
> + st->codec->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
>
> av_bprint_init(&header, 0, AV_BPRINT_SIZE_UNLIMITED);
> av_bprint_init(&cue, 0, AV_BPRINT_SIZE_UNLIMITED);
Regards,
--
Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130108/ea627277/attachment.asc>
More information about the ffmpeg-devel
mailing list