[FFmpeg-cvslog] lavu/avstring: add av_utf8_decode() function
    Stefano Sabatini 
    git at videolan.org
       
    Fri Nov 22 16:55:20 CET 2013
    
    
  
ffmpeg | branch: master | Stefano Sabatini <stefasab at gmail.com> | Thu Oct  3 01:21:40 2013 +0200| [68590650f05f2bf97766362f2817372987c8a52e] | committer: Stefano Sabatini
lavu/avstring: add av_utf8_decode() function
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=68590650f05f2bf97766362f2817372987c8a52e
---
 doc/APIchanges       |    3 +++
 libavutil/Makefile   |    1 +
 libavutil/avstring.c |   64 +++++++++++++++++++++++++++++++++++++++++++++
 libavutil/avstring.h |   40 ++++++++++++++++++++++++++++
 libavutil/utf8.c     |   71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/version.h  |    2 +-
 6 files changed, 180 insertions(+), 1 deletion(-)
diff --git a/doc/APIchanges b/doc/APIchanges
index c0edb64..618df2a 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -15,6 +15,9 @@ libavutil:     2012-10-22
 
 API changes, most recent first:
 
+2013-11-XX - xxxxxxx - lavu 52.54.100 - avstring.h
+  Add av_utf8_decode() function.
+
 2013-11-xx - xxxxxxx - lavc 55.44.100 - avcodec.h
   Add av_packet_{un,}pack_dictionary()
   Add AV_PKT_METADATA_UPDATE side data type, used to transmit key/value
diff --git a/libavutil/Makefile b/libavutil/Makefile
index 9b5cd4e..02dd728 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
@@ -157,6 +157,7 @@ TESTPROGS = adler32                                                     \
             sha                                                         \
             sha512                                                      \
             tree                                                        \
+            utf8                                                        \
             xtea                                                        \
 
 TESTPROGS-$(HAVE_LZO1X_999_COMPRESS) += lzo
diff --git a/libavutil/avstring.c b/libavutil/avstring.c
index eed58fa..2093107 100644
--- a/libavutil/avstring.c
+++ b/libavutil/avstring.c
@@ -307,6 +307,70 @@ int av_isxdigit(int c)
     return av_isdigit(c) || (c >= 'a' && c <= 'f');
 }
 
+/* Decode one UTF-8 sequence from *bufp; contract documented in avstring.h. */
+int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end,
+                   unsigned int flags)
+{
+    const uint8_t *p = *bufp;
+    uint32_t top;   /* lead-byte length-marker bit currently being tested */
+    uint64_t code;
+    int ret = AVERROR(EILSEQ);  /* pessimistic until the sequence parses */
+
+    /* Empty input: nothing is consumed and *codep is left untouched. */
+    if (p >= buf_end)
+        return 0;
+
+    code = *p++;
+
+    /* first sequence byte starts with 10, or is 1111-1110 or 1111-1111,
+       which is not admitted */
+    if ((code & 0xc0) == 0x80 || code >= 0xFE)
+        goto end;
+
+    /* Every set marker bit in the lead byte announces one continuation
+       byte. code grows by 6 bits per byte and the next marker sits one
+       bit lower in the lead byte, hence top moves up by 5 each round. */
+    top = (code & 128) >> 1;
+    while (code & top) {
+        int tmp;
+        if (p >= buf_end)
+            goto end;   /* incomplete sequence */
+        /* we assume the byte to be in the form 10xx-xxxx */
+        tmp = *p++ - 128;   /* strip leading 1 */
+        if (tmp>>6)
+            goto end;   /* not a continuation byte */
+        code = (code<<6) + tmp;
+        top <<= 5;
+    }
+    code &= (top << 1) - 1;     /* drop the length-marker bits */
+
+    /* NOTE(review): overlong encodings (e.g. C0 80) are not rejected here. */
+    /* 1U, not 1: shifting a signed 1 into the sign bit is undefined. */
+    if (code >= 1U<<31)
+        goto end;       /* out-of-range value */
+
+    *codep = code;
+    ret = 0;
+
+    /* Range checks: *codep is already stored, so it stays set on failure. */
+    if (code > 0x10FFFF &&
+        !(flags & AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES))
+        ret = AVERROR(EILSEQ);
+    if (code < 0x20 && code != 0x9 && code != 0xA && code != 0xD &&
+        (flags & AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES))
+        ret = AVERROR(EILSEQ);
+    if (code >= 0xD800 && code <= 0xDFFF &&
+        !(flags & AV_UTF8_FLAG_ACCEPT_SURROGATES))
+        ret = AVERROR(EILSEQ);
+    if ((code == 0xFFFE || code == 0xFFFF) &&
+        !(flags & AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS))
+        ret = AVERROR(EILSEQ);
+
+end:
+    *bufp = p;
+    return ret;
+}
+
 #ifdef TEST
 
 int main(void)
diff --git a/libavutil/avstring.h b/libavutil/avstring.h
index 438ef79..882a2b5 100644
--- a/libavutil/avstring.h
+++ b/libavutil/avstring.h
@@ -22,6 +22,7 @@
 #define AVUTIL_AVSTRING_H
 
 #include <stddef.h>
+#include <stdint.h>
 #include "attributes.h"
 
 /**
@@ -295,6 +296,45 @@ enum AVEscapeMode {
 int av_escape(char **dst, const char *src, const char *special_chars,
               enum AVEscapeMode mode, int flags);
 
+#define AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES          1 ///< accept codepoints over 0x10FFFF
+#define AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS             2 ///< accept non-characters - 0xFFFE and 0xFFFF
+#define AV_UTF8_FLAG_ACCEPT_SURROGATES                 4 ///< accept UTF-16 surrogates codes
+#define AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES 8 ///< exclude control codes not accepted by XML
+
+#define AV_UTF8_FLAG_ACCEPT_ALL \
+    (AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES|AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS|AV_UTF8_FLAG_ACCEPT_SURROGATES) ///< all three ACCEPT flags; parenthesized so it composes safely with & and ~
+
+/**
+ * Read and decode a single UTF-8 code point (character) from the
+ * buffer in *bufp, and update *bufp to point to the next byte to
+ * decode.
+ *
+ * In case of an invalid byte sequence, *bufp is advanced past the
+ * bytes examined so far, *codep is left unmodified and the function
+ * returns an error code.
+ *
+ * Depending on the specified flags, the function will also fail in
+ * case the decoded code point does not belong to a valid range.
+ *
+ * @note For speed-relevant code a carefully implemented use of
+ * GET_UTF8() may be preferred.
+ *
+ * @param codep   pointer used to return the parsed code in case of success.
+ *                The value in *codep is set even in case the range check fails.
+ * @param bufp    pointer to the address of the first byte of the sequence
+ *                to decode, updated by the function to point to the
+ *                byte next after the decoded sequence
+ * @param buf_end pointer to the end of the buffer, points to the next
+ *                byte past the last in the buffer. This is used to
+ *                avoid buffer overreads (in case of an unfinished
+ *                UTF-8 sequence towards the end of the buffer).
+ * @param flags   a collection of AV_UTF8_FLAG_* flags
+ * @return >= 0 if a code point was decoded or the buffer was already empty
+ * (*bufp >= buf_end), a negative value for an invalid sequence or range
+ */
+int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end,
+                   unsigned int flags);
+
 /**
  * @}
  */
diff --git a/libavutil/utf8.c b/libavutil/utf8.c
new file mode 100644
index 0000000..37a2802
--- /dev/null
+++ b/libavutil/utf8.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2013 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdio.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/file.h"
+
+static void print_sequence(const char *p, int l, int indent)
+{   /* hex-dump l bytes of p, then emit indent - 2*l spaces of padding */
+    int pad = indent - 2 * l;
+    for (; l > 0; l--)
+        printf("%02X", (uint8_t)*p++);
+    printf("%*s", pad, "");
+}
+
+/* Decode the file named by argv[1] code point by code point, one report line each. */
+int main(int argc, char **argv)
+{
+    int ret;
+    uint8_t *file_buf;
+    size_t file_buf_size;
+    int32_t code;   /* int32_t to match av_utf8_decode()'s codep parameter */
+    const uint8_t *p, *endp;
+
+    if (argc < 2)   /* argv[1] is NULL when no file argument is given */
+        return 1;
+
+    ret = av_file_map(argv[1], &file_buf, &file_buf_size, 0, NULL);
+    if (ret < 0)
+        return 1;
+
+    p = file_buf;
+    endp = file_buf + file_buf_size;
+    while (p < endp) {
+        int l, r;
+        const uint8_t *p0 = p;
+        code = -1;  /* sentinel: the decoder only stores non-negative values */
+        r = av_utf8_decode(&code, &p, endp, 0);
+        l = (int)(p-p0);
+        print_sequence((const char *)p0, l, 20);
+        if (code < 0) {     /* untouched sentinel: the byte sequence was malformed */
+            printf("invalid sequence\n");
+        } else {
+            printf("%-10d 0x%-10X %-5d ", (int)code, (unsigned)code, l);
+            if (r < 0)            printf("invalid code range\n");
+            else if (*p0 == '\n') printf("\\n\n");
+            else                  printf ("%.*s\n", l, (const char *)p0);
+        }
+    }
+
+    av_file_unmap(file_buf, file_buf_size);
+    return 0;
+}
diff --git a/libavutil/version.h b/libavutil/version.h
index 3e64a20..c01da93 100644
--- a/libavutil/version.h
+++ b/libavutil/version.h
@@ -75,7 +75,7 @@
  */
 
 #define LIBAVUTIL_VERSION_MAJOR  52
-#define LIBAVUTIL_VERSION_MINOR  53
+#define LIBAVUTIL_VERSION_MINOR  54
 #define LIBAVUTIL_VERSION_MICRO 100
 
 #define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
    
    
More information about the ffmpeg-cvslog
mailing list