From f05b33b3a1cff40972a735ff1fb4ed6e8bfeaf2a Mon Sep 17 00:00:00 2001 From: Daisuke Miyakawa Date: Tue, 30 Jun 2009 20:40:42 +0900 Subject: [PATCH] Add useful functions to String8, which enables users to convert between UTF-8 and UTF-32 It will be used in SQL functions in external/sqlite/android. See https://android-git.corp.google.com/g/Gerrit#change,5511 for example. Related internal bug id: 1707173 --- include/utils/String8.h | 116 ++++++++++++++++++- libs/utils/String8.cpp | 298 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 383 insertions(+), 31 deletions(-) diff --git a/include/utils/String8.h b/include/utils/String8.h index c49faf6fe062..ecc57743796d 100644 --- a/include/utils/String8.h +++ b/include/utils/String8.h @@ -29,11 +29,107 @@ // --------------------------------------------------------------------------- +extern "C" { + +typedef uint32_t char32_t; + +size_t strlen32(const char32_t *); +size_t strnlen32(const char32_t *, size_t); + +/* + * Returns the length of "src" when "src" is valid UTF-8 string. + * Returns 0 if src is NULL, 0-length string or non UTF-8 string. + * This function should be used to determine whether "src" is valid UTF-8 + * characters with valid unicode codepoints. "src" must be null-terminated. + * + * If you are going to use other GetUtf... functions defined in this header + * with string which may not be valid UTF-8 with valid codepoint (form 0 to + * 0x10FFFF), you should use this function before calling others, since the + * other functions do not check whether the string is valid UTF-8 or not. + * + * If you do not care whether "src" is valid UTF-8 or not, you should use + * strlen() as usual, which should be much faster. + */ +size_t utf8_length(const char *src); + +/* + * Returns the UTF-32 length of "src". + */ +size_t utf32_length(const char *src, size_t src_len); + +/* + * Returns the UTF-8 length of "src". + */ +size_t utf8_length_from_utf32(const char32_t *src, size_t src_len); + +/* + * Returns the unicode value at "index". + * Returns -1 when the index is invalid (equals to or more than "src_len"). + * If returned value is positive, it is able to be converted to char32_t, which + * is unsigned. Then, if "next_index" is not NULL, the next index to be used is + * stored in "next_index". "next_index" can be NULL. + */ +int32_t utf32_at(const char *src, size_t src_len, + size_t index, size_t *next_index); + +/* + * Stores a UTF-32 string converted from "src" in "dst", if "dst_length" is not + * large enough to store the string, the part of the "src" string is stored + * into "dst". + * Returns the size actually used for storing the string. + * "dst" is not null-terminated when dst_len is fully used (like strncpy). + */ +size_t utf8_to_utf32(const char* src, size_t src_len, + char32_t* dst, size_t dst_len); + +/* + * Stores a UTF-8 string converted from "src" in "dst", if "dst_length" is not + * large enough to store the string, the part of the "src" string is stored + * into "dst" as much as possible. See the examples for more detail. + * Returns the size actually used for storing the string. + * dst" is not null-terminated when dst_len is fully used (like strncpy). + * + * Example 1 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" >= 7 + * -> + * Returned value == 6 + * "dst" becomes \xE3\x81\x82\xE3\x81\x84\0 + * (note that "dst" is null-terminated) + * + * Example 2 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" == 5 + * -> + * Returned value == 3 + * "dst" becomes \xE3\x81\x82\0 + * (note that "dst" is null-terminated, but \u3044 is not stored in "dst" + * since "dst" does not have enough size to store the character) + * + * Example 3 + * "src" == \u3042\u3044 (\xE3\x81\x82\xE3\x81\x84) + * "src_len" == 2 + * "dst_len" == 6 + * -> + * Returned value == 6 + * "dst" becomes \xE3\x81\x82\xE3\x81\x84 + * (note that "dst" is NOT null-terminated, like strncpy) + */ +size_t utf32_to_utf8(const char32_t* src, size_t src_len, + char* dst, size_t dst_len); + +} + +// --------------------------------------------------------------------------- + namespace android { class TextOutput; -//! This is a string holding UTF-8 characters. +//! This is a string holding UTF-8 characters. Does not allow the value more +// than 0x10FFFF, which is not valid unicode codepoint. class String8 { public: @@ -45,7 +141,8 @@ public: explicit String8(const String16& o); explicit String8(const char16_t* o); explicit String8(const char16_t* o, size_t numChars); - + explicit String8(const char32_t* o); + explicit String8(const char32_t* o, size_t numChars); ~String8(); inline const char* string() const; @@ -59,11 +156,20 @@ public: status_t setTo(const char* other); status_t setTo(const char* other, size_t numChars); status_t setTo(const char16_t* other, size_t numChars); - + status_t setTo(const char32_t* other, + size_t length); + status_t append(const String8& other); status_t append(const char* other); status_t append(const char* other, size_t numChars); + // Note that this function takes O(N) time to calculate the value. + // No cache value is stored. + size_t getUtf32Length() const; + int32_t getUtf32At(size_t index, + size_t *next_index) const; + size_t getUtf32(char32_t* dst, size_t dst_len) const; + inline String8& operator=(const String8& other); inline String8& operator=(const char* other); @@ -103,7 +209,7 @@ public: void toLower(size_t start, size_t numChars); void toUpper(); void toUpper(size_t start, size_t numChars); - + /* * These methods operate on the string as if it were a path name. */ @@ -346,7 +452,7 @@ inline String8::operator const char*() const return mString; } -}; // namespace android +} // namespace android // --------------------------------------------------------------------------- diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp index c50d343a731b..71bf3ce7e780 100644 --- a/libs/utils/String8.cpp +++ b/libs/utils/String8.cpp @@ -25,25 +25,39 @@ #include -namespace android { +/* + * Functions outside android is below the namespace android, since they use + * functions and constants in android namespace. + */ // --------------------------------------------------------------------------- -static const uint32_t kByteMask = 0x000000BF; -static const uint32_t kByteMark = 0x00000080; +namespace android { + +static const char32_t kByteMask = 0x000000BF; +static const char32_t kByteMark = 0x00000080; // Surrogates aren't valid for UTF-32 characters, so define some // constants that will let us screen them out. -static const uint32_t kUnicodeSurrogateHighStart = 0x0000D800; -static const uint32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; -static const uint32_t kUnicodeSurrogateLowStart = 0x0000DC00; -static const uint32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; -static const uint32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; -static const uint32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; +static const char32_t kUnicodeSurrogateHighStart = 0x0000D800; +static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF; +static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00; +static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF; +static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart; +static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd; +static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF; // Mask used to set appropriate bits in first byte of UTF-8 sequence, // indexed by number of bytes in the sequence. -static const uint32_t kFirstByteMark[] = { +// 0xxxxxxx +// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000 +// 110yyyyx 10xxxxxx +// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0 +// 1110yyyy 10yxxxxx 10xxxxxx +// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0 +// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx +// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0 +static const char32_t kFirstByteMark[] = { 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 }; @@ -52,7 +66,7 @@ static const uint32_t kFirstByteMark[] = { #define RES_PATH_SEPARATOR '/' // Return number of utf8 bytes required for the character. -static size_t utf32_to_utf8_bytes(uint32_t srcChar) +static size_t utf32_to_utf8_bytes(char32_t srcChar) { size_t bytesToWrite; @@ -79,7 +93,7 @@ static size_t utf32_to_utf8_bytes(uint32_t srcChar) } } // Max code point for Unicode is 0x0010FFFF. - else if (srcChar < 0x00110000) + else if (srcChar <= kUnicodeMaxCodepoint) { bytesToWrite = 4; } @@ -94,7 +108,7 @@ static size_t utf32_to_utf8_bytes(uint32_t srcChar) // Write out the source character to . -static void utf32_to_utf8(uint8_t* dstP, uint32_t srcChar, size_t bytes) +static void utf32_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes) { dstP += bytes; switch (bytes) @@ -126,7 +140,7 @@ void initialize_string8() // Bite me, Darwin! gDarwinIsReallyAnnoying = gDarwinCantLoadAllObjects; #endif - + SharedBuffer* buf = SharedBuffer::alloc(1); char* str = (char*)buf->data(); *str = 0; @@ -160,20 +174,20 @@ static char* allocFromUTF8(const char* in, size_t len) return getEmptyString(); } -// Note: not dealing with expanding surrogate pairs. -static char* allocFromUTF16(const char16_t* in, size_t len) +template +static char* allocFromUTF16OrUTF32(const T* in, L len) { if (len == 0) return getEmptyString(); - + size_t bytes = 0; - const char16_t* end = in+len; - const char16_t* p = in; - + const T* end = in+len; + const T* p = in; + while (p < end) { bytes += utf32_to_utf8_bytes(*p); p++; } - + SharedBuffer* buf = SharedBuffer::alloc(bytes+1); LOG_ASSERT(buf, "Unable to allocate shared buffer"); if (buf) { @@ -181,19 +195,30 @@ static char* allocFromUTF16(const char16_t* in, size_t len) char* str = (char*)buf->data(); char* d = str; while (p < end) { - uint32_t c = *p++; + const T c = *p++; size_t len = utf32_to_utf8_bytes(c); utf32_to_utf8((uint8_t*)d, c, len); d += len; } *d = 0; - + return str; } - + return getEmptyString(); } +// Note: not dealing with expanding surrogate pairs. +static char* allocFromUTF16(const char16_t* in, size_t len) +{ + return allocFromUTF16OrUTF32(in, len); +} + +static char* allocFromUTF32(const char32_t* in, size_t len) +{ + return allocFromUTF16OrUTF32(in, len); +} + // --------------------------------------------------------------------------- String8::String8() @@ -238,6 +263,16 @@ String8::String8(const char16_t* o, size_t len) { } +String8::String8(const char32_t* o) + : mString(allocFromUTF32(o, strlen32(o))) +{ +} + +String8::String8(const char32_t* o, size_t len) + : mString(allocFromUTF32(o, len)) +{ +} + String8::~String8() { SharedBuffer::bufferFromData(mString)->release(); @@ -280,6 +315,16 @@ status_t String8::setTo(const char16_t* other, size_t len) return NO_MEMORY; } +status_t String8::setTo(const char32_t* other, size_t len) +{ + SharedBuffer::bufferFromData(mString)->release(); + mString = allocFromUTF32(other, len); + if (mString) return NO_ERROR; + + mString = getEmptyString(); + return NO_MEMORY; +} + status_t String8::append(const String8& other) { const size_t otherLen = other.bytes(); @@ -418,6 +463,21 @@ void String8::toUpper(size_t start, size_t length) unlockBuffer(len); } +size_t String8::getUtf32Length() const +{ + return utf32_length(mString, length()); +} + +int32_t String8::getUtf32At(size_t index, size_t *next_index) const +{ + return utf32_at(mString, length(), index, next_index); +} + +size_t String8::getUtf32(char32_t* dst, size_t dst_len) const +{ + return utf8_to_utf32(mString, length(), dst, dst_len); +} + TextOutput& operator<<(TextOutput& to, const String8& val) { to << val.string(); @@ -427,7 +487,6 @@ TextOutput& operator<<(TextOutput& to, const String8& val) // --------------------------------------------------------------------------- // Path functions - void String8::setPathName(const char* name) { setPathName(name, strlen(name)); @@ -600,5 +659,192 @@ String8& String8::convertToResPath() return *this; } - }; // namespace android + +// --------------------------------------------------------------------------- + +size_t strlen32(const char32_t *s) +{ + const char32_t *ss = s; + while ( *ss ) + ss++; + return ss-s; +} + +size_t strnlen32(const char32_t *s, size_t maxlen) +{ + const char32_t *ss = s; + while ((maxlen > 0) && *ss) { + ss++; + maxlen--; + } + return ss-s; +} + +size_t utf8_codepoint_count(const char *src) +{ + const char *cur = src; + size_t ret = 0; + while (*cur != '\0') { + const char first_char = *cur++; + if ((first_char & 0x80) == 0) { // ASCII + ret += 1; + continue; + } + // (UTF-8's character must not be like 10xxxxxx, + // but 110xxxxx, 1110xxxx, ... or 1111110x) + if ((first_char & 0x40) == 0) { + return 0; + } + + int32_t mask, to_ignore_mask; + size_t num_to_read = 0; + char32_t utf32 = 0; + for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80; + num_to_read < 5 && (first_char & mask); + num_to_read++, to_ignore_mask |= mask, mask >>= 1) { + if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx + return 0; + } + // 0x3F == 00111111 + utf32 = (utf32 << 6) + (*cur++ & 0x3F); + } + // "first_char" must be (110xxxxx - 11110xxx) + if (num_to_read == 5) { + return 0; + } + to_ignore_mask |= mask; + utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1)); + if (utf32 > android::kUnicodeMaxCodepoint) { + return 0; + } + + ret += num_to_read; + } + return ret; +} + +size_t utf32_length(const char *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return 0; + } + size_t ret = 0; + const char* cur; + const char* end; + size_t num_to_skip; + for (cur = src, end = src + src_len, num_to_skip = 1; + cur < end; + cur += num_to_skip, ret++) { + const char first_char = *cur; + num_to_skip = 1; + if ((first_char & 0x80) == 0) { // ASCII + continue; + } + int32_t mask; + + for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) { + } + } + return ret; +} + +size_t utf8_length_from_utf32(const char32_t *src, size_t src_len) +{ + if (src == NULL || src_len == 0) { + return 0; + } + size_t ret = 0; + const char32_t *end = src + src_len; + while (src < end) { + ret += android::utf32_to_utf8_bytes(*src++); + } + return ret; +} + +static int32_t utf32_at_internal(const char* cur, size_t *num_read) +{ + const char first_char = *cur; + if ((first_char & 0x80) == 0) { // ASCII + *num_read = 1; + return *cur; + } + cur++; + char32_t mask, to_ignore_mask; + size_t num_to_read = 0; + char32_t utf32 = first_char; + for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80; + (first_char & mask); + num_to_read++, to_ignore_mask |= mask, mask >>= 1) { + // 0x3F == 00111111 + utf32 = (utf32 << 6) + (*cur++ & 0x3F); + } + to_ignore_mask |= mask; + utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1))); + + *num_read = num_to_read; + return static_cast(utf32); +} + +int32_t utf32_at(const char *src, size_t src_len, + size_t index, size_t *next_index) +{ + if (index >= src_len) { + return -1; + } + size_t dummy_index; + if (next_index == NULL) { + next_index = &dummy_index; + } + size_t num_read; + int32_t ret = utf32_at_internal(src + index, &num_read); + if (ret >= 0) { + *next_index = index + num_read; + } + + return ret; +} + +size_t utf8_to_utf32(const char* src, size_t src_len, + char32_t* dst, size_t dst_len) +{ + if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { + return 0; + } + + const char* cur = src; + const char* end = src + src_len; + char32_t* cur_utf32 = dst; + const char32_t* end_utf32 = dst + dst_len; + while (cur_utf32 < end_utf32 && cur < end) { + size_t num_read; + *cur_utf32++ = + static_cast(utf32_at_internal(cur, &num_read)); + cur += num_read; + } + if (cur_utf32 < end_utf32) { + *cur_utf32 = 0; + } + return static_cast(cur_utf32 - dst); +} + +size_t utf32_to_utf8(const char32_t* src, size_t src_len, + char* dst, size_t dst_len) +{ + if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) { + return 0; + } + const char32_t *cur_utf32 = src; + const char32_t *end_utf32 = src + src_len; + char *cur = dst; + const char *end = dst + dst_len; + while (cur_utf32 < end_utf32 && cur < end) { + size_t len = android::utf32_to_utf8_bytes(*cur_utf32); + android::utf32_to_utf8((uint8_t *)cur, *cur_utf32++, len); + cur += len; + } + if (cur < end) { + *cur = '\0'; + } + return cur - dst; +} -- 2.11.0