[FFmpeg-devel] [PATCH 2/2] lavc: support subtitles charset conversion.

Sat Feb 16 11:00:18 CET 2013

Le septidi 27 pluviôse, an CCXXI, Clement Boesch a écrit :
> Rebased on master. The two branches are now updated on my github. Also
> attached the two patches from the -nofilter one.

Thanks.

> From eca1faebbf3e21bf4925ae26f4ceb9eef4afb744 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <ubitux at gmail.com>
> Date: Sat, 5 Jan 2013 11:06:31 +0100
> Subject: [PATCH 1/2] lavc: mark bitmap based subtitles codecs as such.

I am still not sure this is useful, but it looks correct. Wait for
confirmation about the avcodec.h part though.

> From 1eb424d688127c37b19765549b612977e760655c Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <ubitux at gmail.com>
> Date: Mon, 7 Jan 2013 18:08:56 +0100
> Subject: [PATCH 2/2] lavc: support subtitles character encoding conversion.
> 
> TODO: bump lavc micro
> ---
>  Changelog                  |   1 +
>  configure                  |   2 +
>  libavcodec/avcodec.h       |  18 ++++++++
>  libavcodec/options_table.h |   1 +
>  libavcodec/utils.c         | 108 +++++++++++++++++++++++++++++++++++++++++++--
>  5 files changed, 127 insertions(+), 3 deletions(-)
> 
> diff --git a/Changelog b/Changelog
> index 4a88e5a..24aeeea 100644
> --- a/Changelog
> +++ b/Changelog
> @@ -18,6 +18,7 @@ version <next>:
>  - il filter ported from libmpcodecs
>  - support ID3v2 tags in ASF files
>  - RF64 support in WAV muxer

> +- Subtitles character re-encoding

"character encoding conversion"?

>  
>  
>  version 1.1:
> diff --git a/configure b/configure
> index 94aff58..1f6172e 100755
> --- a/configure
> +++ b/configure
> @@ -1390,6 +1390,7 @@ HAVE_LIST="
>      gnu_as
>      gsm_h
>      ibm_asm
> +    iconv
>      inet_aton
>      io_h
>      isatty
> @@ -3715,6 +3716,7 @@ check_func  getopt
>  check_func  getrusage
>  check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
>  check_func  gettimeofday
> +check_func  iconv
>  check_func  inet_aton $network_extralibs
>  check_func  isatty
>  check_func  localtime_r
> diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> index fc7091c..4721087 100644
> --- a/libavcodec/avcodec.h
> +++ b/libavcodec/avcodec.h
> @@ -3208,6 +3208,24 @@ typedef struct AVCodecContext {
>       * - encoding: unused
>       */
>      AVDictionary *metadata;
> +

> +    /**
> +     * Character encoding of the input subtitles file.
> +     * - decoding: set by user
> +     * - encoding: unused
> +     */

Missing remark about not accessing the field directly.

> +    char *sub_charenc;
> +
> +    /**
> +     * Subtitles character encoding mode.

> +     * - decoding: set by libavcodec, not intended to be used by user apps

I am really not sure about that: this field is intended, amongst other things,
for lavf if it does the conversion itself. From lavc's point of view, lavf
is a user app.

> +     * - encoding: unused
> +     */
> +    int sub_charenc_mode;
> +#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
> +#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
> +#define FF_SUB_CHARENC_MODE_DECODER_PRE  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv

> +//#define FF_SUB_CHARENC_MODE_DECODER_POST 2  ///< the AVSubitle data needs to be recoded to UTF-8 after the decoder pass, requires iconv

?

>  } AVCodecContext;
>  
>  AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 33cb4b2..f27f5f0 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -406,6 +406,7 @@ static const AVOption options[]={
>  {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
>  {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
>  {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> +{"sub_charenc", "set input text subtitles character encoding", OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
>  {NULL},
>  };
>  
> diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> index 2493798..13bc9ad 100644
> --- a/libavcodec/utils.c
> +++ b/libavcodec/utils.c
> @@ -48,6 +48,9 @@
>  #include <stdarg.h>
>  #include <limits.h>
>  #include <float.h>
> +#if HAVE_ICONV
> +# include <iconv.h>
> +#endif
>  
>  volatile int ff_avcodec_locked;
>  static int volatile entangled_thread_counter = 0;
> @@ -1068,6 +1071,34 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
>              ret = AVERROR(EINVAL);
>              goto free_and_end;
>          }
> +        if (avctx->sub_charenc) {
> +            if (avctx->codec_type != AVMEDIA_TYPE_SUBTITLE) {
> +                av_log(avctx, AV_LOG_ERROR, "Character encoding is only "
> +                       "supported with subtitles codecs\n");
> +                ret = AVERROR(EINVAL);
> +                goto free_and_end;
> +            } else if (avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB) {
> +                av_log(avctx, AV_LOG_WARNING, "Codec '%s' is bitmap-based, "
> +                       "subtitles character encoding will be ignored\n",
> +                       avctx->codec_descriptor->name);
> +                avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> +            } else {
> +                /* input character encoding is set for a text based subtitle
> +                 * codec at this point */
> +                if (avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_AUTOMATIC)
> +                    avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DECODER_PRE;
> +
> +                if (!HAVE_ICONV && avctx->sub_charenc_mode == FF_SUB_CHARENC_MODE_DECODER_PRE) {
> +                    av_log(avctx, AV_LOG_ERROR, "Character encoding subtitles "
> +                           "conversion needs a libavcodec built with iconv support "
> +                           "for this codec\n");
> +                    ret = AVERROR(ENOSYS);
> +                    goto free_and_end;
> +                }
> +            }
> +        } else {
> +            avctx->sub_charenc_mode = FF_SUB_CHARENC_MODE_DO_NOTHING;
> +        }
>      }
>  end:
>      ff_unlock_avcodec();
> @@ -1826,6 +1857,68 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
>      return ret;
>  }
>  
> +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
> +static int recode_subtitle(AVCodecContext *avctx,
> +                           AVPacket *outpkt, const AVPacket *inpkt)
> +{
> +#if HAVE_ICONV
> +    iconv_t cd = (iconv_t)-1;
> +    int ret = 0;
> +    char *inb, *outb;
> +    size_t inl, outl;
> +    AVPacket tmp;
> +#endif
> +
> +    if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_DECODER_PRE)
> +        return 0;
> +
> +#if HAVE_ICONV
> +    cd = iconv_open("UTF-8", avctx->sub_charenc);
> +    if (cd == (iconv_t)-1) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> +               "with input character encoding \"%s\"\n", avctx->sub_charenc);
> +        ret = AVERROR(errno);
> +        goto end;
> +    }
> +
> +    inb = inpkt->data;
> +    inl = inpkt->size;
> +
> +    if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
> +        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for recoding\n");
> +        ret = AVERROR(ENOMEM);
> +        goto end;
> +    }
> +
> +    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
> +    if (ret < 0)
> +        goto end;
> +    outpkt->data = tmp.data;
> +    outpkt->size = tmp.size;
> +    outb = outpkt->data;
> +    outl = outpkt->size;
> +
> +    if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> +        iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1 ||
> +        outl >= outpkt->size || inl != 0) {
> +        av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> +               "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
> +        av_free_packet(&tmp);
> +        ret = AVERROR(errno);
> +        goto end;
> +    }
> +    outpkt->size -= outl;
> +    outpkt->data[outpkt->size - 1] = '\0';
> +
> +end:
> +    if (cd != (iconv_t)-1)
> +        iconv_close(cd);
> +    return ret;
> +#else
> +    av_assert0(!"requesting subtitles recoding without iconv");
> +#endif
> +}
> +
>  int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>                               int *got_sub_ptr,
>                               AVPacket *avpkt)
> @@ -1841,19 +1934,28 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
>      avcodec_get_subtitle_defaults(sub);
>  
>      if (avpkt->size) {
> +        AVPacket pkt_recoded;
>          AVPacket tmp = *avpkt;
>          int did_split = av_packet_split_side_data(&tmp);
>          //apply_param_change(avctx, &tmp);
>  
> -        avctx->pkt = &tmp;
> +        pkt_recoded = tmp;
> +        ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
> +        if (ret < 0) {
> +            *got_sub_ptr = 0;
> +        } else {
> +        avctx->pkt = &pkt_recoded;
>  
>          if (avctx->pkt_timebase.den && avpkt->pts != AV_NOPTS_VALUE)
>              sub->pts = av_rescale_q(avpkt->pts,
>                                      avctx->pkt_timebase, AV_TIME_BASE_Q);
> -        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &tmp);
> +        ret = avctx->codec->decode(avctx, sub, got_sub_ptr, &pkt_recoded);
> +        if (tmp.data != pkt_recoded.data)
> +            av_free(pkt_recoded.data);
>          sub->format = !(avctx->codec_descriptor->props & AV_CODEC_PROP_BITMAP_SUB);
> -
>          avctx->pkt = NULL;
> +        }
> +
>          if (did_split) {
>              ff_packet_free_side_data(&tmp);
>              if(ret == tmp.size)

The rest LGTM now, thanks for your efforts.

Once the iconv part is pushed, I will try to start working on reading text
files with various encodings.

Regards,

-- 
  Nicolas George
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130216/7e21da72/attachment.asc>