[FFmpeg-devel] [PATCH 1/2] lavu: add text_file API.
wm4
nfxjfg at googlemail.com
Thu Aug 8 14:26:15 CEST 2013
On Thu, 8 Aug 2013 14:08:43 +0200
Nicolas George <nicolas.george at normalesup.org> wrote:
> TODO: version bump, APIChanges entry, !HAVE_ICONV path.
>
> Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> ---
> +static const char *const default_encodings[] = {
> + "UTF-8",
> + "US-ASCII",
> + "WINDOWS-1252",
> + "ISO-8859-1",
> + NULL
> +};
> +
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> + iconv_t cd;
> + AVBPrint bp;
> + char *inbuf, *outbuf, *recoded;
> + size_t insize, outsize, insize_orig;
> + unsigned outsize_int;
> + int ret = 0;
> +
> + if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
> + return AVERROR(errno);
> + av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> + inbuf = tf->full_data;
> + insize = tf->full_data_size;
> + while (insize) {
> + av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
> + if (outsize_int <= 1) {
> + ret = AVERROR(ENOMEM);
> + break;
> + }
> + outsize_int--;
> + outsize = outsize_int;
> + insize_orig = insize;
> + iconv(cd, &inbuf, &insize, &outbuf, &outsize);
> + if (insize == insize_orig) {
> + ret = AVERROR_INVALIDDATA;
> + break;
> + }
> + bp.len += outsize_int - outsize;
> + }
> + iconv_close(cd);
> + if (ret < 0) {
> + av_bprint_finalize(&bp, NULL);
> + return ret;
> + }
> + av_assert1(!insize);
> + bp.str[bp.len] = 0;
> + if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
> + return ret;
> + av_free(tf->full_data);
> + tf->full_data = recoded;
> + tf->full_data_size = bp.len;
> + tf->encoding = encoding;
> + return 0;
> +}
> +
> +static int guess_encoding(AVTextFile *tf)
> +{
> + const char *bom_encoding[2] = { NULL, NULL };
> + const char *const *encodings;
> + int ret, i;
> +
> + encodings = tf->encodings;
> + if (!encodings) {
> + for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
> + if (!memcmp(tf->full_data, byte_order_marks[i].bom,
> + byte_order_marks[i].len)) {
> + encodings = bom_encoding;
> + bom_encoding[0] = byte_order_marks[i].encoding;
> + break;
> + }
> + }
> + if (!encodings)
> + encodings = default_encodings;
> + }
> +
> + for (i = 0; encodings[i]; i++)
> + if ((ret = try_encoding(tf, encodings[i])) >= 0)
> + return ret;
> +
> + av_strlcpy(tf->error, "Unable to guess character encoding",
> + sizeof(tf->error));
> + return AVERROR_INVALIDDATA;
> +}
I assume this is for subtitle support.
There are so many libraries that try to auto-detect encodings using
elaborate statistical methods, and they all fail sometimes in one way
or another - and this is supposed to be sufficient? How do you even
distinguish between these 8-bit codepage encodings?
How can an application do its own auto-detection?
More information about the ffmpeg-devel mailing list