[FFmpeg-devel] [PATCH 2/5] lavu: add text_file API.
wm4
nfxjfg at googlemail.com
Sat Aug 10 14:48:41 CEST 2013
On Thu, 8 Aug 2013 17:03:32 +0200
Nicolas George <nicolas.george at normalesup.org> wrote:
> TODO: version bump, APIChanges entry
>
> Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> ---
> libavutil/Makefile | 1 +
> libavutil/text_file.c | 284 +++++++++++++++++++++++++++++++++++++++++++++++++
> libavutil/text_file.h | 181 +++++++++++++++++++++++++++++++
> 3 files changed, 466 insertions(+)
> create mode 100644 libavutil/text_file.c
> create mode 100644 libavutil/text_file.h
>
> diff --git a/libavutil/Makefile b/libavutil/Makefile
> index 21746f0..7d59a73 100644
> --- a/libavutil/Makefile
> +++ b/libavutil/Makefile
> @@ -107,6 +107,7 @@ OBJS = adler32.o \
> samplefmt.o \
> sha.o \
> sha512.o \
> + text_file.o \
> time.o \
> timecode.o \
> tree.o \
> diff --git a/libavutil/text_file.c b/libavutil/text_file.c
> new file mode 100644
> index 0000000..7d6c35d
> --- /dev/null
> +++ b/libavutil/text_file.c
> @@ -0,0 +1,284 @@
> +/*
> + * Copyright (c) 2013 Nicolas George
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public License
> + * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "avassert.h"
> +#include "avstring.h"
> +#include "bprint.h"
> +#include "internal.h"
> +#include "text_file.h"
> +
> +#define COPY_FROM_USER(var) \
> + av_assert0(var ## _user->struct_size <= sizeof(var)); \
> + memcpy(&var, var ## _user, var ## _user->struct_size);
> +#define COPY_TO_USER(var) \
> + memcpy(var ## _user, &var, var ## _user->struct_size);
> +
> +static const struct {
> + unsigned char encoding[9], bom[4], len;
> +} byte_order_marks[] = {
> + { "UTF-8", "\xef\xbb\xbf", 3 },
> + { "UCS-4BE", "\x00\x00\xfe\xff", 4 },
> + { "UCS-4LE", "\xff\xfe\x00\x00", 4 },
> + { "UTF-16BE", "\xfe\xff", 2 },
> + { "UTF-16LE", "\xff\xfe", 2 },
> +};
> +
> +static const char *const default_encodings[] = {
> + "UTF-8",
> + "US-ASCII",
> + "WINDOWS-1252",
> + "ISO-8859-1",
> + NULL
> +};
Makes no sense, see below.
> +#if CONFIG_ICONV
> +
> +#include <iconv.h>
> +
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> + iconv_t cd;
> + AVBPrint bp;
> + char *inbuf, *outbuf, *recoded;
> + size_t insize, outsize, insize_orig;
> + unsigned outsize_int;
> + int ret = 0;
> +
> + if ((cd = iconv_open("UTF-8", encoding)) == (iconv_t)-1)
> + return AVERROR(errno);
> + av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> + inbuf = tf->full_data;
> + insize = tf->full_data_size;
> + while (insize) {
> + av_bprint_get_buffer(&bp, 512, (unsigned char **)&outbuf, &outsize_int);
> + if (outsize_int <= 1) {
> + ret = AVERROR(ENOMEM);
> + break;
> + }
> + outsize_int--;
> + outsize = outsize_int;
> + insize_orig = insize;
> + iconv(cd, &inbuf, &insize, &outbuf, &outsize);
> + if (insize == insize_orig) {
> + ret = AVERROR_INVALIDDATA;
> + break;
> + }
> + bp.len += outsize_int - outsize;
> + }
> + iconv_close(cd);
> + if (ret < 0) {
> + av_bprint_finalize(&bp, NULL);
> + return ret;
> + }
> + av_assert1(!insize);
> + bp.str[bp.len] = 0;
> + if ((ret = av_bprint_finalize(&bp, &recoded)) < 0)
> + return ret;
> + av_free(tf->full_data);
> + tf->full_data = recoded;
> + tf->full_data_size = bp.len;
> + tf->encoding = encoding;
> + return 0;
> +}
> +
> +#else
> +
> +static int try_encoding(AVTextFile *tf, const char *encoding)
> +{
> + if (av_strcasecmp(encoding, "UTF-8")) {
> + av_strlcpy(tf->error, "Character encoding conversion not supported, "
> + "enable iconv at compilation", sizeof(tf->error));
> + return AVERROR(ENOSYS);
> + }
> + int r = avpriv_utf8_check(tf->full_data, tf->full_data_size);
> + av_log(0, 16, "r = %d\n", r);
> + return avpriv_utf8_check(tf->full_data, tf->full_data_size) ? 0 :
> + AVERROR_INVALIDDATA;
> +}
> +
> +#endif
> +
> +static int guess_encoding(AVTextFile *tf)
> +{
> + const char *bom_encoding[2] = { NULL, NULL };
> + const char *const *encodings;
> + int ret, i;
> +
> + encodings = tf->encodings;
> + if (!encodings) {
> + for (i = 0; i < FF_ARRAY_ELEMS(byte_order_marks); i++) {
> + if (!memcmp(tf->full_data, byte_order_marks[i].bom,
> + byte_order_marks[i].len)) {
> + encodings = bom_encoding;
> + bom_encoding[0] = byte_order_marks[i].encoding;
> + break;
> + }
> + }
> + if (!encodings)
> + encodings = default_encodings;
> + }
> +
> + ret = AVERROR_INVALIDDATA;
> + for (i = 0; encodings[i]; i++)
> + if ((ret = try_encoding(tf, encodings[i])) >= 0)
> + return ret;
> +
> + if (!*tf->error)
> + av_strlcpy(tf->error, "Unable to guess character encoding",
> + sizeof(tf->error));
> + return ret;
> +}
This does not work. You're trying to detect legacy 8-bit codepage
encodings by testing whether conversion to iconv works.
For example, as far as I remember, ISO-8859-1 is at best a
subset of WINDOWS-1252, so your code will never detect ISO-8859-1.
This kind of detection will most likely lead to broken text, which is
silently accepted without even printing a warning anywhere.
Additionally, the application (and not even a ffmpeg.c user) has the
slightest control over how character set detection and conversion is
handled, so this automagic fails to do anything useful and is strictly
a disimprovement over the previous code. (Except that it can read
UTF-16 encoded files now.)
> +static void remove_cr(AVTextFile *tf)
> +{
> + uint8_t *p, *q, *end;
> +
> + p = q = tf->text;
> + end = p + tf->text_size;
> + for (; p < end; p++)
> + if (*p != '\r' || p[1] != '\n')
> + *(q++) = *p;
> + tf->text_size = q - tf->text;
> + *(q++) = 0;
> +}
> +
> +static int split_lines(AVTextFile *tf)
> +{
> + size_t i, nb_lines = 0;
> + uint8_t *p, *end = tf->text + tf->text_size;
> +
> + if (tf->text_size) {
> + nb_lines++;
> + for (p = tf->text; p < end - 1; p++)
> + if (*p == '\n')
> + nb_lines++;
> + }
> + tf->lines = av_calloc(nb_lines + 1, sizeof(*tf->lines));
> + tf->lines[0] = p = tf->text;
Not checking for av_calloc result.
> + for (i = 1; i < nb_lines; i++) {
> + p = memchr(p, '\n', end - p);
> + av_assert1(p);
> + *p = 0;
> + tf->lines[i] = ++p;
> + }
> + if (tf->text_size) {
> + if ((p = memchr(p, '\n', end - p))) {
> + av_assert1(p == end - 1);
> + *p = 0;
> + } else {
> + tf->text_flags |= AV_TEXT_FLAG_NO_EOL;
> + }
> + }
> + tf->nb_lines = nb_lines;
> + return 0;
> +}
Why does it create an array of lines? This seems slow, complex, and
unnecessary. Separate \n and \r processing also prevents you from
treating MacOS line endings correctly.
> +static int text_file_process(AVTextFile *tf)
> +{
> + int ret;
> +
> + tf->text_flags = 0;
> + if ((ret = guess_encoding(tf)) < 0)
> + return ret;
> + tf->text = tf->full_data;
> + tf->text_size = tf->full_data_size;
> +
> + if (!memcmp(tf->text, byte_order_marks[0].bom, 3)) {
> + tf->text_size -= 3;
> + tf->text += 3;
> + tf->text_flags |= AV_TEXT_FLAG_HAS_BOM;
> + }
> +
> + if ((tf->flags & AV_TEXT_FLAG_REMOVE_CR))
> + remove_cr(tf);
> + if ((tf->flags & AV_TEXT_FLAG_SPLIT_LINES))
> + if ((ret = split_lines(tf)) < 0)
> + return ret;
> +
> + return 0;
> +}
> +
> +static int text_file_try_read(AVTextFile *tf,
> + AVTextFileRead callback, void *opaque)
> +{
> + AVBPrint bp;
> + unsigned buf_size;
> + uint8_t *buf;
> + int ret;
> +
> + av_bprint_init(&bp, 0, AV_BPRINT_SIZE_UNLIMITED);
> + while (1) {
> + av_bprint_get_buffer(&bp, 512, &buf, &buf_size);
> + if (buf_size <= 1) {
> + ret = AVERROR(ENOMEM);
> + break;
> + }
> + ret = callback(opaque, buf, FFMIN(buf_size - 1, INT_MAX));
> + if (ret < 0) {
> + if (ret == AVERROR_EOF)
> + ret = 0;
> + break;
> + }
> + bp.len += ret;
> + }
> +
> + if (ret < 0) {
> + av_bprint_finalize(&bp, NULL);
> + return ret;
> + }
> + if ((ret = av_bprint_finalize(&bp, (char **)&tf->full_data)) < 0)
> + return ret;
> + tf->full_data_size = bp.len;
> + return text_file_process(tf);
> +}
> +
> +static int text_file_read(AVTextFile *tf,
> + AVTextFileRead callback, void *opaque)
> +{
> + int ret;
> +
> + *tf->error = 0;
> + if ((ret = text_file_try_read(tf, callback, opaque)) < 0) {
> + if (!*tf->error)
> + av_strerror(ret, tf->error, sizeof(tf->error));
> + av_text_file_free(tf);
> + }
> + return ret;
> +}
> +
> +void av_text_file_free(AVTextFile *tf)
> +{
> + tf->text = NULL;
> + av_freep(&tf->lines);
> + av_freep(&tf->full_data);
> + tf->text_size = tf->full_data_size = 0;
> +}
> +
> +int av_text_file_read_callback(AVTextFile *tf_user,
> + AVTextFileRead callback, void *opaque)
> +{
> + AVTextFile tf = { 0 };
> + int ret;
> +
> + COPY_FROM_USER(tf);
> + ret = text_file_read(&tf, callback, opaque);
> + COPY_TO_USER(tf);
> + return ret;
> +}
Why does this use its own callback facility instead of using avio?
> diff --git a/libavutil/text_file.h b/libavutil/text_file.h
> new file mode 100644
> index 0000000..d1bfbd3
> --- /dev/null
> +++ b/libavutil/text_file.h
> @@ -0,0 +1,181 @@
> +/*
> + * Copyright (c) 2013 Nicolas George
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public License
> + * as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public License
> + * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#ifndef AVUTIL_TEXT_FILE_H
> +#define AVUTIL_TEXT_FILE_H
> +
> +#include "common.h"
> +
> +/**
> + * Structure to help read text files.
> + * This API allows to read text files (or other ways of storing text) while
> + * handling the subtleties of character encodings, end-of-line separators,
> + * etc.
> + *
> + * The text returned by this function is always recoded to UTF-8.
> + *
> + * The typical way of using this API is to declare a AVTextFile variable
> + * with the default initialization macro:
> + * AVTextFile tf = { AV_TEXT_FILE_DEFAULT };
> + * Then set fields to control various parts of the process and use it with
> + * the API functions.
> + */
> +typedef struct AVTextFile {
> +
> + /**
> + * Size of the structure; must be set to sizeof(AVTextFile) to ensure
> + * compatibility with later versions of the library.
> + */
> +
> + size_t struct_size;
> +
> + /**
> + * Text read from the file. Always terminated by an additional 0.
> + */
> + uint8_t *text;
> +
> + /**
> + * Size of text, in bytes, not counting the additional terminating 0.
> + */
> + size_t text_size;
> +
> + /**
> + * Full data buffer containing the text; must be freed with av_free()
> + * when no longer needed. Can be different from text due to details such
> + * as byte-order-marks.
> + */
> + uint8_t *full_data;
> +
> + /**
> + * Size of full_data, in bytes, not counting the additional 0.
> + */
> + size_t full_data_size;
> +
> + /**
> + * Detected encoding; will point to either a static string or an element
> + * of the encodings field.
> + */
> + const char *encoding;
> +
> + /**
> + * List of encodings for audodetection, terminated by NULL.
> + * The first encoding in this list that can apply to the file is used.
> + */
> + const char *const *encodings;
> +
> + /**
> + * Lines of the file; only relevant if AV_TEXT_FLAG_SPLIT_LINES is set.
> + * Terminated by an additional NULL pointer.
> + */
> + char **lines;
> +
> + /**
> + * Number of elements in the lines array, not counting the additional
> + * NULL.
> + */
> + size_t nb_lines;
> +
> + /**
> + * Flags to control the processing of the file. See the AV_TEXT_FLAG_*
> + * constants below.
> + */
> + unsigned flags;
> +
> + /**
> + * Flags describing features of the file hidden by the conversion. See
> + * the AV_TEXT_FLAG_* constants below.
> + */
> + unsigned text_flags;
> +
> + /**
> + * Error message. If something fails, this field will contain a
> + * human-readable error message.
> + */
> + char error[128];
> +
> +} AVTextFile;
> +
> +/**
> + * Processing flags.
> + * The following constants apply to the AVTextFile.flags field.
> + */
> +enum {
> +
> + /**
> + * Split the file into individual lines.
> + * The newline characters are replaced by 0.
> + */
> + AV_TEXT_FLAG_SPLIT_LINES = 0x1,
> +
> + /**
> + * Remove CR (\r) before LF (\n).
> + * In other words, convert DOS-style line breaks to Unix-style.
> + */
> + AV_TEXT_FLAG_REMOVE_CR = 0x2,
> +};
> +
> +/**
> + * Result flags.
> + * The following constants apply to the AVTextFile.text_flags field.
> + */
> +enum {
> +
> + /**
> + * The file had a byte order mark.
> + * The first character of the file was U+FEFF ZERO WIDTH NO-BREAK SPACE.
> + */
> + AV_TEXT_FLAG_HAS_BOM = 0x1,
> +
> + /**
> + * The final line of the file was not terminated by a final LF (\n).
> + * Only relevant if lines were split.
> + */
> + AV_TEXT_FLAG_NO_EOL = 0x2,
> +};
> +
> +/**
> + * Callback to read from a file.
> + * @param opaque opaque value passed from the caller
> + * @param buf buffer to fill with the file data
> + * @param buf_size size of the buffer
> + * @return the number of bytes read or a negative error code
> + */
> +typedef int (*AVTextFileRead)(void *opaque, unsigned char *buf, int buf_size);
> +
> +/**
> + * Read a text file from a callback.
> + */
> +int av_text_file_read_callback(AVTextFile *tf,
> + AVTextFileRead callback, void *opaque);
> +
> +/**
> + * Read a text file from the local file system (using stdio).
> + */
> +int av_text_file_read_file(AVTextFile *tf, const char *filename);
> +
> +/**
> + * Free all memory allocated while reading the file.
> + * The corresponding fields are set to NULL.
> + */
> +void av_text_file_free(AVTextFile *tf);
> +
> +#define AV_TEXT_FILE_DEFAULT sizeof(AVTextFile)
> +
> +#endif /* AVUTIL_TEXT_FILE_H */
All in all, this patch adds lots of code without actually improving
anything, except testing for UTF-16.
Note that there is a more robust way to detect UTF-16 (even if there
are no BOMs available), for that look what MPlayer's subreader.c does.
More information about the ffmpeg-devel
mailing list