[FFmpeg-devel] [PATCH 2/5] lavu: add text_file API.

Sat Aug 10 14:48:41 CEST 2013

On Thu,  8 Aug 2013 17:03:32 +0200
Nicolas George <nicolas.george at normalesup.org> wrote:

> TODO: version bump, APIChanges entry
> 
> Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> ---
>  libavutil/Makefile    |    1 +
>  libavutil/text_file.c |  284 +++++++++++++++++++++++++++++++++++++++++++++++++
>  libavutil/text_file.h |  181 +++++++++++++++++++++++++++++++
>  3 files changed, 466 insertions(+)
>  create mode 100644 libavutil/text_file.c
>  create mode 100644 libavutil/text_file.h
> 
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 21746f0..7d59a73 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -107,6 +107,7 @@ OBJS = adler32.o                                                        \
>         samplefmt.o                                                      \
>         sha.o                                                            \
>         sha512.o                                                         \
> +       text_file.o                                                      \
>         time.o                                                           \
>         timecode.o                                                       \
>         tree.o                                                           \
> diff --git a/libavutil/text_file.c b/libavutil/text_file.c
> new file mode 100644
> index 0000000..7d6c35d
> --- /dev/null
> +++ b/libavutil/text_file.c
> @@ -0,0 +1,284 @@
> +/*
> + * Copyright (c) 2013 Nicolas George
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public License
> + * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "avassert.h"
> +#include "avstring.h"
> +#include "bprint.h"
> +#include "internal.h"
> +#include "text_file.h"
> +
> +#define COPY_FROM_USER(var) \
> +    av_assert0(var ## _user->struct_size <= sizeof(var)); \
> +    memcpy(&var, var ## _user, var ## _user->struct_size);
> +#define COPY_TO_USER(var) \
> +    memcpy(var ## _user, &var, var ## _user->struct_size);
> +
> +static const struct {
> +    unsigned char encoding[9], bom[4], len;
> +} byte_order_marks[] = {
> +    { "UTF-8",    "\xef\xbb\xbf",     3 },
> +    { "UCS-4BE",  "\x00\x00\xfe\xff", 4 },
> +    { "UCS-4LE",  "\xff\xfe\x00\x00", 4 },
> +    { "UTF-16BE", "\xfe\xff",         2 },
> +    { "UTF-16LE", "\xff\xfe",         2 },
> +};
> +
> +static const char *const default_encodings[] = {
> +    "UTF-8",
> +    "US-ASCII",
> +    "WINDOWS-1252",
> +    "ISO-8859-1",
> +    NULL
> +};

Makes no sense, see below.

> +#if CONFIG_ICONV
> +
> +#include <iconv.h>
> +
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> +    iconv_t cd;
> +    AVBPrint bp;
> +    char *inbuf, *outbuf, *recoded;
> +    size_t insize, outsize, insize_orig;
> +    unsigned outsize_int;
> +    int ret = 0;
> +
> +    if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
> +        return AVERROR(errno);
> +    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> +    inbuf  = tf->full_data;
> +    insize = tf->full_data_size;
> +    while (insize) {
> +        av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
> +        if (outsize_int <= 1) {
> +            ret = AVERROR(ENOMEM);
> +            break;
> +        }
> +        outsize_int--;
> +        outsize = outsize_int;
> +        insize_orig = insize;
> +        iconv(cd, &inbuf, &insize, &outbuf, &outsize);
> +        if (insize == insize_orig) {
> +            ret = AVERROR_INVALIDDATA;
> +            break;
> +        }
> +        bp.len += outsize_int - outsize;
> +    }
> +    iconv_close(cd);
> +    if (ret < 0) {
> +        av_bprint_finalize(&bp, NULL);
> +        return ret;
> +    }
> +    av_assert1(!insize);
> +    bp.str[bp.len] = 0;
> +    if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
> +        return ret;
> +    av_free(tf->full_data);
> +    tf->full_data      = recoded;
> +    tf->full_data_size = bp.len;
> +    tf->encoding       = encoding;
> +    return 0;
> +}
> +
> +#else
> +
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> +    if (av_strcasecmp(encoding, "UTF-8")) {
> +        av_strlcpy(tf->error, "Character encoding conversion not supported, "
> +                   "enable iconv at compilation", sizeof(tf->error));
> +        return AVERROR(ENOSYS);
> +    }
> +    int r = avpriv_utf8_check(tf->full_data, tf->full_data_size);
> +    av_log(0, 16, "r = %d\n", r);
> +    return avpriv_utf8_check(tf->full_data, tf->full_data_size) ? 0 :
> +           AVERROR_INVALIDDATA;
> +}
> +
> +#endif
> +
> +static int guess_encoding(AVTextFile *tf)
> +{
> +    const char *bom_encoding[2] = { NULL, NULL };
> +    const char *const *encodings;
> +    int ret, i;
> +
> +    encodings = tf->encodings;
> +    if (!encodings) {
> +        for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
> +            if (!memcmp(tf->full_data, byte_order_marks[i].bom,
> +                                       byte_order_marks[i].len)) {
> +                encodings = bom_encoding;
> +                bom_encoding[0] = byte_order_marks[i].encoding;
> +                break;
> +            }
> +        }
> +        if (!encodings)
> +            encodings = default_encodings;
> +    }
> +
> +    ret = AVERROR_INVALIDDATA;
> +    for (i = 0; encodings[i]; i++)
> +        if ((ret = try_encoding(tf, encodings[i])) >= 0)
> +            return ret;
> +
> +    if (!*tf->error)
> +        av_strlcpy(tf->error, "Unable to guess character encoding",
> +                   sizeof(tf->error));
> +    return ret;
> +}

This does not work. You're trying to detect legacy 8-bit codepage
encodings by testing whether conversion to iconv works.
For example, as far as I remember, ISO-8859-1 is at best a
subset of WINDOWS-1252, so your code will never detect ISO-8859-1.

This kind of detection will most likely lead to broken text, which is
silently accepted without even printing a warning anywhere.

Additionally, the application (and not even a ffmpeg.c user) has the
slightest control over how character set detection and conversion is
handled, so this automagic fails to do anything useful and is strictly
a disimprovement over the previous code. (Except that it can read
UTF-16 encoded files now.)

> +static void remove_cr(AVTextFile *tf)
> +{
> +    uint8_t *p, *q, *end;
> +
> +    p = q = tf->text;
> +    end = p + tf->text_size;
> +    for (; p < end; p++)
> +        if (*p != '\r' || p[1] != '\n')
> +            *(q++) = *p;
> +    tf->text_size = q - tf->text;
> +    *(q++) = 0;
> +}
> +
> +static int split_lines(AVTextFile *tf)
> +{
> +    size_t i, nb_lines = 0;
> +    uint8_t *p, *end = tf->text + tf->text_size;
> +
> +    if (tf->text_size) {
> +        nb_lines++;
> +        for (p = tf->text; p < end - 1; p++)
> +            if (*p == '\n')
> +                nb_lines++;
> +    }
> +    tf->lines = av_calloc(nb_lines + 1, sizeof(*tf->lines));
> +    tf->lines[0] = p = tf->text;

Not checking for av_calloc result.

> +    for (i = 1; i < nb_lines; i++) {
> +        p = memchr(p, '\n', end - p);
> +        av_assert1(p);
> +        *p = 0;
> +        tf->lines[i] = ++p;
> +    }
> +    if (tf->text_size) {
> +        if ((p = memchr(p, '\n', end - p))) {
> +            av_assert1(p == end - 1);
> +            *p = 0;
> +        } else {
> +            tf->text_flags |= AV_TEXT_FLAG_NO_EOL;
> +        }
> +    }
> +    tf->nb_lines = nb_lines;
> +    return 0;
> +}

Why does it create an array of lines? This seems slow, complex, and
unnecessary. Separate \n and \r processing also prevents you from
treating MacOS line endings correctly.

> +static int text_file_process(AVTextFile *tf)
> +{
> +    int ret;
> +
> +    tf->text_flags = 0;
> +    if ((ret = guess_encoding(tf)) < 0)
> +        return ret;
> +    tf->text      = tf->full_data;
> +    tf->text_size = tf->full_data_size;
> +
> +    if (!memcmp(tf->text, byte_order_marks[0].bom, 3)) {
> +        tf->text_size  -= 3;
> +        tf->text       += 3;
> +        tf->text_flags |= AV_TEXT_FLAG_HAS_BOM;
> +    }
> +
> +    if ((tf->flags & AV_TEXT_FLAG_REMOVE_CR))
> +        remove_cr(tf);
> +    if ((tf->flags & AV_TEXT_FLAG_SPLIT_LINES))
> +        if ((ret = split_lines(tf)) < 0)
> +            return ret;
> +
> +    return 0;
> +}
> +
> +static int text_file_try_read(AVTextFile *tf,
> +                              AVTextFileRead callback, void *opaque)
> +{
> +    AVBPrint bp;
> +    unsigned buf_size;
> +    uint8_t *buf;
> +    int ret;
> +
> +    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> +    while (1) {
> +        av_bprint_get_buffer(&bp, 512, &buf, &buf_size);
> +        if (buf_size <= 1) {
> +            ret = AVERROR(ENOMEM);
> +            break;
> +        }
> +        ret = callback(opaque, buf, FFMIN(buf_size - 1, INT_MAX));
> +        if (ret < 0) {
> +            if (ret == AVERROR_EOF)
> +                ret = 0;
> +            break;
> +        }
> +        bp.len += ret;
> +    }
> +
> +    if (ret < 0) {
> +        av_bprint_finalize(&bp, NULL);
> +        return ret;
> +    }
> +    if ((ret = av_bprint_finalize(&bp, (char **)&tf->full_data)) < 0)
> +        return ret;
> +    tf->full_data_size = bp.len;
> +    return text_file_process(tf);
> +}
> +
> +static int text_file_read(AVTextFile *tf,
> +                          AVTextFileRead callback, void *opaque)
> +{
> +    int ret;
> +
> +    *tf->error = 0;
> +    if ((ret = text_file_try_read(tf, callback, opaque)) < 0) {
> +        if (!*tf->error)
> +            av_strerror(ret, tf->error, sizeof(tf->error));
> +        av_text_file_free(tf);
> +    }
> +    return ret;
> +}
> +
> +void av_text_file_free(AVTextFile *tf)
> +{
> +    tf->text = NULL;
> +    av_freep(&tf->lines);
> +    av_freep(&tf->full_data);
> +    tf->text_size = tf->full_data_size = 0;
> +}
> +
> +int av_text_file_read_callback(AVTextFile *tf_user,
> +                               AVTextFileRead callback, void *opaque)
> +{
> +    AVTextFile tf = { 0 };
> +    int ret;
> +
> +    COPY_FROM_USER(tf);
> +    ret = text_file_read(&tf, callback, opaque);
> +    COPY_TO_USER(tf);
> +    return ret;
> +}

Why does this use its own callback facility instead of using avio?

> diff --git a/libavutil/text_file.h b/libavutil/text_file.h
> new file mode 100644
> index 0000000..d1bfbd3
> --- /dev/null
> +++ b/libavutil/text_file.h
> @@ -0,0 +1,181 @@
> +/*
> + * Copyright (c) 2013 Nicolas George
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public License
> + * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_TEXT_FILE_H
> +#define AVUTIL_TEXT_FILE_H
> +
> +#include "common.h"
> +
> +/**
> + * Structure to help read text files.
> + * This API allows to read text files (or other ways of storing text) while
> + * handling the subtleties of character encodings, end-of-line separators,
> + * etc.
> + *
> + * The text returned by this function is always recoded to UTF-8.
> + *
> + * The typical way of using this API is to declare a AVTextFile variable
> + * with the default initialization macro:
> + * AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
> + * Then set fields to control various parts of the process and use it with
> + * the API functions.
> + */
> +typedef struct AVTextFile {
> +
> +    /**
> +     * Size of the structure; must be set to sizeof(AVTextFile) to ensure
> +     * compatibility with later versions of the library.
> +     */
> +
> +    size_t struct_size;
> +
> +    /**
> +     * Text read from the file. Always terminated by an additional 0.
> +     */
> +    uint8_t *text;
> +
> +    /**
> +     * Size of text, in bytes, not counting the additional terminating 0.
> +     */
> +    size_t text_size;
> +
> +    /**
> +     * Full data buffer containing the text; must be freed with av_free()
> +     * when no longer needed. Can be different from text due to details such
> +     * as byte-order-marks.
> +     */
> +    uint8_t *full_data;
> +
> +    /**
> +     * Size of full_data, in bytes, not counting the additional 0.
> +     */
> +    size_t full_data_size;
> +
> +    /**
> +     * Detected encoding; will point to either a static string or an element
> +     * of the encodings field.
> +     */
> +    const char *encoding;
> +
> +    /**
> +     * List of encodings for audodetection, terminated by NULL.
> +     * The first encoding in this list that can apply to the file is used.
> +     */
> +    const char *const *encodings;
> +
> +    /**
> +     * Lines of the file; only relevant if AV_TEXT_FLAG_SPLIT_LINES is set.
> +     * Terminated by an additional NULL pointer.
> +     */
> +    char **lines;
> +
> +    /**
> +     * Number of elements in the lines array, not counting the additional
> +     * NULL.
> +     */
> +    size_t nb_lines;
> +
> +    /**
> +     * Flags to control the processing of the file. See the AV_TEXT_FLAG_*
> +     * constants below.
> +     */
> +    unsigned flags;
> +
> +    /**
> +     * Flags describing features of the file hidden by the conversion. See
> +     * the AV_TEXT_FLAG_* constants below.
> +     */
> +    unsigned text_flags;
> +
> +    /**
> +     * Error message. If something fails, this field will contain a
> +     * human-readable error message.
> +     */
> +    char error[128];
> +
> +} AVTextFile;
> +
> +/**
> + * Processing flags.
> + * The following constants apply to the AVTextFile.flags field.
> + */
> +enum {
> +
> +    /**
> +     * Split the file into individual lines.
> +     * The newline characters are replaced by 0.
> +     */
> +    AV_TEXT_FLAG_SPLIT_LINES           = 0x1,
> +
> +    /**
> +     * Remove CR (\r) before LF (\n).
> +     * In other words, convert DOS-style line breaks to Unix-style.
> +     */
> +    AV_TEXT_FLAG_REMOVE_CR             = 0x2,
> +};
> +
> +/**
> + * Result flags.
> + * The following constants apply to the AVTextFile.text_flags field.
> + */
> +enum {
> +
> +    /**
> +     * The file had a byte order mark.
> +     * The first character of the file was U+FEFF ZERO WIDTH NO-BREAK SPACE.
> +     */
> +    AV_TEXT_FLAG_HAS_BOM               = 0x1,
> +
> +    /**
> +     * The final line of the file was not terminated by a final LF (\n).
> +     * Only relevant if lines were split.
> +     */
> +    AV_TEXT_FLAG_NO_EOL                = 0x2,
> +};
> +
> +/**
> + * Callback to read from a file.
> + * @param opaque    opaque value passed from the caller
> + * @param buf       buffer to fill with the file data
> + * @param buf_size  size of the buffer
> + * @return  the number of bytes read or a negative error code
> + */
> +typedef int (*AVTextFileRead)(void *opaque, unsigned char *buf, int buf_size);
> +
> +/**
> + * Read a text file from a callback.
> + */
> +int av_text_file_read_callback(AVTextFile *tf,
> +                               AVTextFileRead callback, void *opaque);
> +
> +/**
> + * Read a text file from the local file system (using stdio).
> + */
> +int av_text_file_read_file(AVTextFile *tf, const char *filename);
> +
> +/**
> + * Free all memory allocated while reading the file.
> + * The corresponding fields are set to NULL.
> + */
> +void av_text_file_free(AVTextFile *tf);
> +
> +#define AV_TEXT_FILE_DEFAULT sizeof(AVTextFile)
> +
> +#endif /* AVUTIL_TEXT_FILE_H */

All in all, this patch adds lots of code without actually improving
anything, except testing for UTF-16.

Note that there is a more robust way to detect UTF-16 (even if there
are no BOMs available), for that look what MPlayer's subreader.c does.