[FFmpeg-devel] [PATCH] lavc: support subtitles charset conversion.

Mon Dec 31 11:54:47 CET 2012

On Mon, Dec 31, 2012 at 11:37:33AM +0100, Nicolas George wrote:
> Le primidi 11 nivôse, an CCXXI, Clement Boesch a écrit :
> > ---
> >  Changelog                  |  1 +
> >  configure                  |  2 ++
> >  libavcodec/avcodec.h       |  7 +++++
> >  libavcodec/options_table.h |  1 +
> >  libavcodec/utils.c         | 65 +++++++++++++++++++++++++++++++++++++++++++++-
> >  5 files changed, 75 insertions(+), 1 deletion(-)
> 
> I am sorry, but this will not work. I have on occasion found srt files in
> UCS-2/UTF-16 (with byte order mark, so totally recognizable). Since this
> encoding is not compatible with ASCII, recoding after the subtitle has been
> stored in the AVSubtitleRect.ass field is not possible.
> 
> Of course, currently these files just fail to parse, but we will probably
> want to change that at some point¹. Also, I suspect there are a few muxed
> files with S_TEXT or equivalent streams in ASCII-incompatible encodings.
> 
> I am not saying that this patch should be completely trashed, but it needs a
> lot more of forethought.
> 

Yes, the UTF-16 is not supported; I assumed ASCII compatible input,
because the current text subtitles decoder only support that. If we want
to support UTF-16 in some decoders, shouldn't we just change ff_get_line()
(and the manual read_chunk function) to support a convert from UTF-16 to
UTF-8? That way, anything related to text subtitles will just work with
classic strings. And this patch as well.

> 
> 1: it might be pretty easy, in fact: introduce and use ff_read_text_file
> that returns an array of lines and does all the autodetection and recoding.
> 
> > diff --git a/Changelog b/Changelog
> > index b0a0de5..f2bc343 100644
> > --- a/Changelog
> > +++ b/Changelog
> > @@ -52,6 +52,7 @@ version <next>:
> >  - NIST Sphere demuxer
> >  - MPL2, VPlayer, MPlayer, AQTitle, PJS and SubViewer v1 subtitles demuxers and decoders
> >  - Sony Wave64 muxer
> 
> > +- Subtitles charset conversion
> 
> AFAIK, "charset" is invalid. Technically, UTF-8 and UCS-4 are the same
> character *set*, Unicode, and they only differ in their byte encoding; the
> correct name would be "character encoding", but this is a bit long.
> 
> (For searchability, "charset" should probably be present in the docs,
> though, along with "codepage" for microsoft-addicts.)
> 

iconv is said to be a "character set converter" so I kept that vocabulary.
I'll try to be more rigorous in the next patch, thanks.

> >  
> >  
> >  version 1.0:
> > diff --git a/configure b/configure
> > index faea887..0360fc9 100755
> > --- a/configure
> > +++ b/configure
> > @@ -1380,6 +1380,7 @@ HAVE_LIST="
> >      glob
> >      gnu_as
> >      ibm_asm
> > +    iconv
> >      inet_aton
> >      io_h
> >      isatty
> > @@ -3695,6 +3696,7 @@ check_func  getopt
> >  check_func  getrusage
> >  check_struct "sys/time.h sys/resource.h" "struct rusage" ru_maxrss
> >  check_func  gettimeofday
> > +check_func  iconv
> >  check_func  inet_aton $network_extralibs
> >  check_func  isatty
> >  check_func  localtime_r
> > diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
> > index 012a31c..7a3761d 100644
> > --- a/libavcodec/avcodec.h
> > +++ b/libavcodec/avcodec.h
> > @@ -3188,6 +3188,13 @@ typedef struct AVCodecContext {
> >       * - encoding: unused
> >       */
> >      AVDictionary *metadata;
> > +
> > +    /**
> > +     * Charset of the input subtitles file.
> > +     * - decoding: set by user
> > +     * - encoding: unused
> > +     */
> > +    char *sub_charset;
> >  } AVCodecContext;
> >  
> >  AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
> > diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> > index 2a31fa6..38d493d 100644
> > --- a/libavcodec/options_table.h
> > +++ b/libavcodec/options_table.h
> > @@ -403,6 +403,7 @@ static const AVOption options[]={
> >  {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, "audio_service_type"},
> >  {"request_sample_fmt", "sample format audio decoders should prefer", OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, {.i64=AV_SAMPLE_FMT_NONE}, -1, AV_SAMPLE_FMT_NB-1, A|D, "request_sample_fmt"},
> >  {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 }, 0, INT_MAX, 0},
> > +{"sub_charset", "set input text subtitles charset", OFFSET(sub_charset), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, S|D},
> >  {NULL},
> >  };
> >  
> > diff --git a/libavcodec/utils.c b/libavcodec/utils.c
> > index 918954a..a9a5168 100644
> > --- a/libavcodec/utils.c
> > +++ b/libavcodec/utils.c
> > @@ -47,6 +47,9 @@
> >  #include <stdarg.h>
> >  #include <limits.h>
> >  #include <float.h>
> > +#if HAVE_ICONV
> > +# include <iconv.h>
> > +#endif
> >  
> >  volatile int ff_avcodec_locked;
> >  static int volatile entangled_thread_counter = 0;
> > @@ -1058,6 +1061,12 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
> >              ret = AVERROR(EINVAL);
> >              goto free_and_end;
> >          }
> > +        if (!HAVE_ICONV && avctx->codec_type == AVMEDIA_TYPE_SUBTITLE && avctx->sub_charset) {
> > +            av_log(avctx, AV_LOG_ERROR, "Charset subtitles conversion needs "
> > +                   "a libavcodec built with iconv support.\n");
> > +            ret = AVERROR(EINVAL);
> > +            goto free_and_end;
> > +        }
> >      }
> >  end:
> >      ff_unlock_avcodec();
> > @@ -1816,6 +1825,56 @@ int attribute_align_arg avcodec_decode_audio4(AVCodecContext *avctx,
> >      return ret;
> >  }
> >  
> > +#if HAVE_ICONV
> > +static int recode_subtitle(AVCodecContext *avctx, AVSubtitle *sub)
> > +{
> > +    int i, ret = 0;
> > +    iconv_t cd;
> > +
> > +    if (!avctx->sub_charset || sub->format == 0)
> > +        return 0;
> > +
> 
> > +    cd = iconv_open("UTF-8", avctx->sub_charset);
> > +    if (cd == (iconv_t)-1) {
> > +        av_log(avctx, AV_LOG_ERROR, "Unable to open iconv context "
> > +               "with input charset \"%s\"\n", avctx->sub_charset);
> > +        return AVERROR_EXTERNAL;
> > +    }
> 
> Opening a new iconv for each event may be quite expensive.
> 

Yes; this is relatively inefficient currently. A possible way of doing
this is to avctx->conv_ctx = malloc(sizeof(iconv_t)), but I went for the
simplest solution at first.

> > +
> > +    for (i = 0; i < sub->num_rects; i++) {
> > +        char *inb, *outb, *inb_start, *outb_start;
> > +        size_t inl, outl;
> > +
> > +        inb = inb_start = sub->rects[i]->ass;
> > +        if (!inb)
> > +            break;
> 
> > +        inl  = strlen(inb);
> > +        outl = 4*inl;
> 
> Possible overflow (but very unlikely).
> 
> Also, what is that 4*? UTF-8 can use up to 6 bytes per codepoint. Enlarging
> the buffer in case of overflow may be AVERROR_PATCHWELCOME.
> 

Mmh, I quickly scrolled down the UTF-8 wiki page, saw 1 to 4 bytes in the
examples, and thought it was the maximum. I guess 6*inl would do…

> > +        outb = outb_start = av_malloc(outl + 1);
> > +        if (!outb) {
> > +            ret = AVERROR(ENOMEM);
> > +            break;
> > +        }
> > +
> > +        if (iconv(cd, &inb, &inl, &outb, &outl) == (size_t)-1 ||
> > +            iconv(cd, NULL, NULL, &outb, &outl) == (size_t)-1) {
> > +            av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> > +                   "from %s to UTF-8\n", inb_start, avctx->sub_charset);
> > +            av_free(outb_start);
> 
> > +            ret = AVERROR_EXTERNAL;
> 
> AVERROR(errno)?
> 

Ah, true.

> > +            break;
> > +        }
> > +        *outb = 0;
> > +
> > +        av_free(inb_start);
> > +        sub->rects[i]->ass = outb_start;
> > +    }
> > +
> > +    iconv_close(cd);
> > +    return ret;
> > +}
> > +#endif
> > +
> >  int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> >                               int *got_sub_ptr,
> >                               AVPacket *avpkt)
> > @@ -1850,8 +1909,12 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
> >                  ret = avpkt->size;
> >          }
> >  
> > -        if (*got_sub_ptr)
> > +        if (*got_sub_ptr) {
> > +#if HAVE_ICONV
> > +            ret = recode_subtitle(avctx, sub);
> > +#endif
> >              avctx->frame_number++;
> > +        }
> >      }
> >  
> >      return ret;
> 
> Regards,
> 

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20121231/6531cba7/attachment.asc>