From: Ivailo Monev Date: Thu, 28 Nov 2019 23:13:56 +0000 (+0000) Subject: reimplement JavaScriptCore UTF-8 conversion via QTextCodec X-Git-Tag: 4.12.0~4950 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=4da20dab1211807ec08bee40df35e5afc00584f2;p=kde%2FKatie.git reimplement JavaScriptCore UTF-8 conversion via QTextCodec Signed-off-by: Ivailo Monev --- diff --git a/src/3rdparty/javascriptcore/API/JSStringRef.cpp b/src/3rdparty/javascriptcore/API/JSStringRef.cpp index 3c53d012f..28751d103 100644 --- a/src/3rdparty/javascriptcore/API/JSStringRef.cpp +++ b/src/3rdparty/javascriptcore/API/JSStringRef.cpp @@ -30,6 +30,8 @@ #include "OpaqueJSString.h" #include +#include + using namespace JSC; using namespace WTF::Unicode; @@ -43,11 +45,12 @@ JSStringRef JSStringCreateWithUTF8CString(const char* string) { initializeThreading(); if (string) { - size_t length = strlen(string); - Vector buffer(length); - UChar* p = buffer.data(); - if (conversionOK == convertUTF8ToUTF16(&string, string + length, &p, p + length)) - return OpaqueJSString::create(buffer.data(), p - buffer.data()).releaseRef(); + QTextCodec *codec = QTextCodec::codecForName("UTF-8"); + QTextDecoder decoder(codec, QTextCodec::DefaultConversion); + QString result = decoder.toUnicode(string, strlen(string)); + if (!decoder.hasFailure()) { + return OpaqueJSString::create(reinterpret_cast(result.unicode()), result.size()).releaseRef(); + } } // Null string. @@ -86,14 +89,15 @@ size_t JSStringGetUTF8CString(JSStringRef string, char* buffer, size_t bufferSiz if (!bufferSize) return 0; - char* p = buffer; - const UChar* d = string->characters(); - ConversionResult result = convertUTF16ToUTF8(&d, d + string->length(), &p, p + bufferSize - 1, true); - *p++ = '\0'; - if (result != conversionOK && result != targetExhausted) + QTextCodec *codec = QTextCodec::codecForName("UTF-8"); + QTextEncoder encoder(codec, QTextCodec::DefaultConversion); + QByteArray result = encoder.fromUnicode(reinterpret_cast(string->characters()), string->length()); + if (encoder.hasFailure()) { + buffer = Q_NULLPTR; return 0; - - return p - buffer; + } + buffer = result.data(); + return result.size(); } bool JSStringIsEqual(JSStringRef a, JSStringRef b) diff --git a/src/3rdparty/javascriptcore/runtime/UString.cpp b/src/3rdparty/javascriptcore/runtime/UString.cpp index 9f6d3899a..e0d94fa4d 100644 --- a/src/3rdparty/javascriptcore/runtime/UString.cpp +++ b/src/3rdparty/javascriptcore/runtime/UString.cpp @@ -47,6 +47,8 @@ #include #endif +#include + using namespace WTF; using namespace WTF::Unicode; using namespace std; @@ -130,13 +132,14 @@ UString UString::createFromUTF8(const char* string) if (!string) return null(); - size_t length = strlen(string); - Vector buffer(length); - UChar* p = buffer.data(); - if (conversionOK != convertUTF8ToUTF16(&string, string + length, &p, p + length)) + QTextCodec *codec = QTextCodec::codecForName("UTF-8"); + QTextDecoder decoder(codec, QTextCodec::DefaultConversion); + QString result = decoder.toUnicode(string, strlen(string)); + if (decoder.hasFailure()) { return null(); + } - return UString(buffer.data(), p - buffer.data()); + return UString(reinterpret_cast(result.unicode()), result.size()); } UString UString::from(int i) @@ -780,18 +783,13 @@ bool equal(const UString::Rep* r, const UString::Rep* b) const char* UString::UTF8String(bool strict) const { - // Allocate a buffer big enough to hold all the characters. - const int length = size(); - Vector buffer(length * 3); - - // Convert to runs of 8-bit characters. - char* p = buffer.data(); - const UChar* d = &data()[0]; - ConversionResult result = convertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict); - if (result != conversionOK) - return 0; - - return buffer.data(); + QTextCodec *codec = QTextCodec::codecForName("UTF-8"); + QTextEncoder encoder(codec, QTextCodec::DefaultConversion); + QByteArray result = encoder.fromUnicode(reinterpret_cast(data()), size()); + if (encoder.hasFailure()) { + return Q_NULLPTR; + } + return result.constData(); } // For use in error handling code paths -- having this not be inlined helps avoid PIC branches to fetch the global on Mac OS X. diff --git a/src/3rdparty/javascriptcore/wtf/unicode/UTF8.cpp b/src/3rdparty/javascriptcore/wtf/unicode/UTF8.cpp index 6f4a7ef14..25bbfa7c6 100644 --- a/src/3rdparty/javascriptcore/wtf/unicode/UTF8.cpp +++ b/src/3rdparty/javascriptcore/wtf/unicode/UTF8.cpp @@ -113,192 +113,5 @@ int decodeUTF8Sequence(const char* sequence) return -1; } -// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed -// into the first byte, depending on how many bytes follow. There are -// as many entries in this table as there are UTF-8 sequence types. -// (I.e., one byte sequence, two byte... etc.). Remember that sequencs -// for *legal* UTF-8 will be 4 or fewer bytes total. -static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -ConversionResult convertUTF16ToUTF8( - const UChar** sourceStart, const UChar* sourceEnd, - char** targetStart, char* targetEnd, bool strict) -{ - ConversionResult result = conversionOK; - const UChar* source = *sourceStart; - char* target = *targetStart; - while (source < sourceEnd) { - UChar32 ch; - unsigned short bytesToWrite = 0; - const UChar32 byteMask = 0xBF; - const UChar32 byteMark = 0x80; - const UChar* oldSource = source; // In case we have to back up because of target overflow. - ch = static_cast(*source++); - // If we have a surrogate pair, convert to UChar32 first. - if (ch >= 0xD800 && ch <= 0xDBFF) { - // If the 16 bits following the high surrogate are in the source buffer... - if (source < sourceEnd) { - UChar32 ch2 = static_cast(*source); - // If it's a low surrogate, convert to UChar32. - if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { - ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; - ++source; - } else if (strict) { // it's an unpaired high surrogate - --source; // return to the illegal value itself - result = sourceIllegal; - break; - } - } else { // We don't have the 16 bits following the high surrogate. - --source; // return to the high surrogate - result = sourceExhausted; - break; - } - } else if (strict) { - // UTF-16 surrogate values are illegal in UTF-32 - if (ch >= 0xDC00 && ch <= 0xDFFF) { - --source; // return to the illegal value itself - result = sourceIllegal; - break; - } - } - // Figure out how many bytes the result will require - if (ch < (UChar32)0x80) { - bytesToWrite = 1; - } else if (ch < (UChar32)0x800) { - bytesToWrite = 2; - } else if (ch < (UChar32)0x10000) { - bytesToWrite = 3; - } else if (ch < (UChar32)0x110000) { - bytesToWrite = 4; - } else { - bytesToWrite = 3; - ch = 0xFFFD; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; // Back up source pointer! - target -= bytesToWrite; - result = targetExhausted; - break; - } - switch (bytesToWrite) { // note: everything falls through. - case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -// This must be called with the length pre-determined by the first byte. -// If presented with a length > 4, this returns false. The Unicode -// definition of UTF-8 goes up to 4-byte sequences. -static bool isLegalUTF8(const unsigned char* source, int length) -{ - unsigned char a; - const unsigned char* srcptr = source + length; - switch (length) { - default: return false; - // Everything else falls through when "true"... - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - // no fall-through in this inner switch - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) - return false; - return true; -} - -// Magic values subtracted from a buffer value during UTF8 conversion. -// This table contains as many values as there might be trailing bytes -// in a UTF-8 sequence. -static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, static_cast(0xFA082080UL), static_cast(0x82082080UL) }; - -ConversionResult convertUTF8ToUTF16( - const char** sourceStart, const char* sourceEnd, - UChar** targetStart, UChar* targetEnd, bool strict) -{ - ConversionResult result = conversionOK; - const char* source = *sourceStart; - UChar* target = *targetStart; - while (source < sourceEnd) { - UChar32 ch = 0; - int extraBytesToRead = UTF8SequenceLength(*source) - 1; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; - break; - } - // Do this check whether lenient or strict - if (!isLegalUTF8(reinterpret_cast(source), extraBytesToRead + 1)) { - result = sourceIllegal; - break; - } - // The cases all fall through. - switch (extraBytesToRead) { - case 5: ch += static_cast(*source++); ch <<= 6; // remember, illegal UTF-8 - case 4: ch += static_cast(*source++); ch <<= 6; // remember, illegal UTF-8 - case 3: ch += static_cast(*source++); ch <<= 6; - case 2: ch += static_cast(*source++); ch <<= 6; - case 1: ch += static_cast(*source++); ch <<= 6; - case 0: ch += static_cast(*source++); - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead + 1); // Back up source pointer! - result = targetExhausted; break; - } - if (ch <= 0xFFFF) { - // UTF-16 surrogate values are illegal in UTF-32 - if (ch >= 0xD800 && ch <= 0xDFFF) { - if (strict) { - source -= (extraBytesToRead + 1); // return to the illegal value itself - result = sourceIllegal; - break; - } else - *target++ = 0xFFFD; - } else - *target++ = (UChar)ch; // normal case - } else if (ch > 0x10FFFF) { - if (strict) { - result = sourceIllegal; - source -= (extraBytesToRead + 1); // return to the start - break; // Bail out; shouldn't continue - } else - *target++ = 0xFFFD; - } else { - // target is a character in range 0xFFFF - 0x10FFFF - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead + 1); // Back up source pointer! - result = targetExhausted; - break; - } - ch -= 0x0010000UL; - *target++ = (UChar)((ch >> 10) + 0xD800); - *target++ = (UChar)((ch & 0x03FF) + 0xDC00); - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - } } diff --git a/src/3rdparty/javascriptcore/wtf/unicode/UTF8.h b/src/3rdparty/javascriptcore/wtf/unicode/UTF8.h index a5ed93e94..3faeeb5af 100644 --- a/src/3rdparty/javascriptcore/wtf/unicode/UTF8.h +++ b/src/3rdparty/javascriptcore/wtf/unicode/UTF8.h @@ -40,35 +40,6 @@ namespace WTF { // Only allows Unicode characters (U-00000000 to U-0010FFFF). // Returns -1 if the sequence is not valid (including presence of extra bytes). int decodeUTF8Sequence(const char*); - - typedef enum { - conversionOK, // conversion successful - sourceExhausted, // partial character in source, but hit end - targetExhausted, // insuff. room in target for conversion - sourceIllegal // source sequence is illegal/malformed - } ConversionResult; - - // These conversion functions take a "strict" argument. When this - // flag is set to strict, both irregular sequences and isolated surrogates - // will cause an error. When the flag is set to lenient, both irregular - // sequences and isolated surrogates are converted. - // - // Whether the flag is strict or lenient, all illegal sequences will cause - // an error return. This includes sequences such as: , , - // or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code - // must check for illegal sequences. - // - // When the flag is set to lenient, characters over 0x10FFFF are converted - // to the replacement character; otherwise (when the flag is set to strict) - // they constitute an error. - - ConversionResult convertUTF8ToUTF16( - const char** sourceStart, const char* sourceEnd, - UChar** targetStart, UChar* targetEnd, bool strict = true); - - ConversionResult convertUTF16ToUTF8( - const UChar** sourceStart, const UChar* sourceEnd, - char** targetStart, char* targetEnd, bool strict = true); } }