[PATCH] id3v2: add support for UTF-16 encoding.
Anton Khirnov
wyskas
Sun Sep 6 13:34:48 CEST 2009
---
libavformat/id3v2.c | 34 ++++++++++++++++++++++++++++++++++
libavutil/common.h | 26 ++++++++++++++++++++++++++
2 files changed, 60 insertions(+), 0 deletions(-)
diff --git a/libavformat/id3v2.c b/libavformat/id3v2.c
index 0cf2cb1..fe79c01 100644
--- a/libavformat/id3v2.c
+++ b/libavformat/id3v2.c
@@ -81,6 +81,7 @@ static void read_ttag(AVFormatContext *s, int taglen, const char *key)
char *q, dst[512];
int len, dstlen = sizeof(dst) - 1;
unsigned genre;
+ unsigned int (*get)(ByteIOContext*) = NULL;
dst[0] = 0;
if (taglen < 1)
@@ -99,11 +100,44 @@ static void read_ttag(AVFormatContext *s, int taglen, const char *key)
*q = '\0';
break;
+ case 1: /* UTF-16 with BOM */
+ taglen -= 2;
+ switch (get_be16(s->pb)) {
+ case 0xfeff:
+ get = get_be16;
+ break;
+ case 0xfffe:
+ get = get_le16;
+ break;
+ default:
+ av_log(s, AV_LOG_ERROR, "Incorrect BOM value.\n");
+ return;
+ }
+ // fall-through
+
+ case 2: /* UTF-16BE without BOM */
+ if (!get)
+ get = get_be16;
+
+ q = dst;
+ while (taglen > 1) {
+ uint32_t ch;
+ uint8_t tmp;
+
+ GET_UTF16(ch, get(s->pb), break;, len)
+ PUT_UTF8(ch, tmp, if (q - dst < dstlen -1) *q++ = tmp;)
+ taglen -= len;
+ }
+ *q = '\0';
+ break;
+
case 3: /* UTF-8 */
len = FFMIN(taglen, dstlen - 1);
get_buffer(s->pb, dst, len);
dst[len] = 0;
break;
+ default:
+ av_log(s, AV_LOG_WARNING, "Unknown encoding in tag %s\n.", key);
}
if (!strcmp(key, "genre")
diff --git a/libavutil/common.h b/libavutil/common.h
index 0797a79..458e32f 100644
--- a/libavutil/common.h
+++ b/libavutil/common.h
@@ -266,6 +266,32 @@ static inline av_const int av_ceil_log2(int x)
}
/*!
+ * \def GET_UTF16(val, GET_BYTE, ERROR, read)
+ * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form.
+ * \param val is the output and should be an lvalue of type uint32_t. It holds
+ * the converted UCS-4 character.
+ * \param GET_BYTE yields one 16-bit UTF-16 code unit (type uint16_t) from any
+ * proper source; it can be a function call or an expression. It is evaluated
+ * up to 2 times (the second time only after a high surrogate was read).
+ * \param ERROR action that is taken when the first code unit is a lone low
+ * surrogate (0xDC00-0xDFFF). It should be a statement that jumps out of the
+ * macro, like exit(), goto, return, break, or continue.
+ * \param read should be an int and is set to the number of bytes read (2 or 4).
+ */
+#define GET_UTF16(val, GET_BYTE, ERROR, read)\
+    {\
+        val = GET_BYTE;\
+        read = 2;\
+        if (val >= 0xD800 && val <= 0xDBFF) {\
+            val = ((val - 0xD800) << 10) + ((GET_BYTE) - 0xDC00) + 0x0010000UL;\
+            read += 2;\
+        }\
+        else if (val > 0xDBFF && val <= 0xDFFF) {\
+            ERROR\
+        }\
+    }
+
+/*!
* \def PUT_UTF8(val, tmp, PUT_BYTE)
* Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
* \param val is an input-only argument and should be of type uint32_t. It holds
--
1.6.3.3
--cNdxnHkX5QqsyA0e--
More information about the ffmpeg-devel
mailing list