#include "OpaqueJSString.h"
#include <wtf/unicode/UTF8.h>
+#include <QTextCodec>
+
using namespace JSC;
using namespace WTF::Unicode;
{
initializeThreading();
if (string) {
- size_t length = strlen(string);
- Vector<UChar, 1024> buffer(length);
- UChar* p = buffer.data();
- if (conversionOK == convertUTF8ToUTF16(&string, string + length, &p, p + length))
- return OpaqueJSString::create(buffer.data(), p - buffer.data()).releaseRef();
+ QTextCodec *codec = QTextCodec::codecForName("UTF-8");
+ QTextDecoder decoder(codec, QTextCodec::DefaultConversion);
+ QString result = decoder.toUnicode(string, strlen(string));
+ if (!decoder.hasFailure()) {
+ return OpaqueJSString::create(reinterpret_cast<const UChar*>(result.unicode()), result.size()).releaseRef();
+ }
}
// Null string.
if (!bufferSize)
return 0;
- char* p = buffer;
- const UChar* d = string->characters();
- ConversionResult result = convertUTF16ToUTF8(&d, d + string->length(), &p, p + bufferSize - 1, true);
- *p++ = '\0';
- if (result != conversionOK && result != targetExhausted)
+ // Encode the UTF-16 contents of |string| as UTF-8 with Qt's codec, then copy
+ // as much as fits into the caller-supplied |buffer| (always NUL-terminated).
+ QTextCodec *codec = QTextCodec::codecForName("UTF-8");
+ QTextEncoder encoder(codec, QTextCodec::DefaultConversion);
+ QByteArray result = encoder.fromUnicode(reinterpret_cast<const QChar*>(string->characters()), string->length());
+ if (encoder.hasFailure()) {
+ // |buffer| is passed by value, so assigning to it would not be seen by the
+ // caller; leave an empty C string behind instead (bufferSize >= 1 here).
+ buffer[0] = '\0';
return 0;
-
- return p - buffer;
+ }
+ // Reserve one byte for the terminating NUL.
+ size_t length = static_cast<size_t>(result.size());
+ if (length > bufferSize - 1) {
+ length = bufferSize - 1;
+ // Never cut a multi-byte sequence in half: while the first dropped byte is a
+ // continuation byte (10xxxxxx), back up to the start of that character.
+ while (length && (static_cast<unsigned char>(result.at(static_cast<int>(length))) & 0xC0) == 0x80)
+ --length;
+ }
+ // The QByteArray dies at the end of this function, so its bytes have to be
+ // copied into the caller's storage rather than handed out by pointer.
+ memcpy(buffer, result.constData(), length);
+ buffer[length] = '\0';
+ // Number of bytes written, including the terminating NUL.
+ return length + 1;
}
bool JSStringIsEqual(JSStringRef a, JSStringRef b)
#include <strings.h>
#endif
+#include <QTextCodec>
+
using namespace WTF;
using namespace WTF::Unicode;
using namespace std;
if (!string)
return null();
- size_t length = strlen(string);
- Vector<UChar, 1024> buffer(length);
- UChar* p = buffer.data();
- if (conversionOK != convertUTF8ToUTF16(&string, string + length, &p, p + length))
+ QTextCodec *codec = QTextCodec::codecForName("UTF-8");
+ QTextDecoder decoder(codec, QTextCodec::DefaultConversion);
+ QString result = decoder.toUnicode(string, strlen(string));
+ if (decoder.hasFailure()) {
return null();
+ }
- return UString(buffer.data(), p - buffer.data());
+ return UString(reinterpret_cast<const UChar*>(result.unicode()), result.size());
}
UString UString::from(int i)
const char* UString::UTF8String(bool strict) const
{
- // Allocate a buffer big enough to hold all the characters.
- const int length = size();
- Vector<char, 1024> buffer(length * 3);
-
- // Convert to runs of 8-bit characters.
- char* p = buffer.data();
- const UChar* d = &data()[0];
- ConversionResult result = convertUTF16ToUTF8(&d, d + length, &p, p + buffer.size(), strict);
- if (result != conversionOK)
- return 0;
-
- return buffer.data();
+ QTextCodec *codec = QTextCodec::codecForName("UTF-8");
+ QTextEncoder encoder(codec, QTextCodec::DefaultConversion);
+ QByteArray result = encoder.fromUnicode(reinterpret_cast<const QChar*>(data()), size());
+ if (encoder.hasFailure()) {
+ return Q_NULLPTR;
+ }
+ return result.constData();
}
// For use in error handling code paths -- having this not be inlined helps avoid PIC branches to fetch the global on Mac OS X.
return -1;
}
-// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
-// into the first byte, depending on how many bytes follow. There are
-// as many entries in this table as there are UTF-8 sequence types.
-// (I.e., one byte sequence, two byte... etc.). Remember that sequencs
-// for *legal* UTF-8 will be 4 or fewer bytes total.
-static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
-ConversionResult convertUTF16ToUTF8(
- const UChar** sourceStart, const UChar* sourceEnd,
- char** targetStart, char* targetEnd, bool strict)
-{
- ConversionResult result = conversionOK;
- const UChar* source = *sourceStart;
- char* target = *targetStart;
- while (source < sourceEnd) {
- UChar32 ch;
- unsigned short bytesToWrite = 0;
- const UChar32 byteMask = 0xBF;
- const UChar32 byteMark = 0x80;
- const UChar* oldSource = source; // In case we have to back up because of target overflow.
- ch = static_cast<unsigned short>(*source++);
- // If we have a surrogate pair, convert to UChar32 first.
- if (ch >= 0xD800 && ch <= 0xDBFF) {
- // If the 16 bits following the high surrogate are in the source buffer...
- if (source < sourceEnd) {
- UChar32 ch2 = static_cast<unsigned short>(*source);
- // If it's a low surrogate, convert to UChar32.
- if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
- ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
- ++source;
- } else if (strict) { // it's an unpaired high surrogate
- --source; // return to the illegal value itself
- result = sourceIllegal;
- break;
- }
- } else { // We don't have the 16 bits following the high surrogate.
- --source; // return to the high surrogate
- result = sourceExhausted;
- break;
- }
- } else if (strict) {
- // UTF-16 surrogate values are illegal in UTF-32
- if (ch >= 0xDC00 && ch <= 0xDFFF) {
- --source; // return to the illegal value itself
- result = sourceIllegal;
- break;
- }
- }
- // Figure out how many bytes the result will require
- if (ch < (UChar32)0x80) {
- bytesToWrite = 1;
- } else if (ch < (UChar32)0x800) {
- bytesToWrite = 2;
- } else if (ch < (UChar32)0x10000) {
- bytesToWrite = 3;
- } else if (ch < (UChar32)0x110000) {
- bytesToWrite = 4;
- } else {
- bytesToWrite = 3;
- ch = 0xFFFD;
- }
-
- target += bytesToWrite;
- if (target > targetEnd) {
- source = oldSource; // Back up source pointer!
- target -= bytesToWrite;
- result = targetExhausted;
- break;
- }
- switch (bytesToWrite) { // note: everything falls through.
- case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
- case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
- case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
- case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]);
- }
- target += bytesToWrite;
- }
- *sourceStart = source;
- *targetStart = target;
- return result;
-}
-
-// This must be called with the length pre-determined by the first byte.
-// If presented with a length > 4, this returns false. The Unicode
-// definition of UTF-8 goes up to 4-byte sequences.
-static bool isLegalUTF8(const unsigned char* source, int length)
-{
- unsigned char a;
- const unsigned char* srcptr = source + length;
- switch (length) {
- default: return false;
- // Everything else falls through when "true"...
- case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
- case 2: if ((a = (*--srcptr)) > 0xBF) return false;
-
- switch (*source) {
- // no fall-through in this inner switch
- case 0xE0: if (a < 0xA0) return false; break;
- case 0xED: if (a > 0x9F) return false; break;
- case 0xF0: if (a < 0x90) return false; break;
- case 0xF4: if (a > 0x8F) return false; break;
- default: if (a < 0x80) return false;
- }
-
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
- }
- if (*source > 0xF4)
- return false;
- return true;
-}
-
-// Magic values subtracted from a buffer value during UTF8 conversion.
-// This table contains as many values as there might be trailing bytes
-// in a UTF-8 sequence.
-static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
- 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
-
-ConversionResult convertUTF8ToUTF16(
- const char** sourceStart, const char* sourceEnd,
- UChar** targetStart, UChar* targetEnd, bool strict)
-{
- ConversionResult result = conversionOK;
- const char* source = *sourceStart;
- UChar* target = *targetStart;
- while (source < sourceEnd) {
- UChar32 ch = 0;
- int extraBytesToRead = UTF8SequenceLength(*source) - 1;
- if (source + extraBytesToRead >= sourceEnd) {
- result = sourceExhausted;
- break;
- }
- // Do this check whether lenient or strict
- if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
- result = sourceIllegal;
- break;
- }
- // The cases all fall through.
- switch (extraBytesToRead) {
- case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
- case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
- case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
- case 0: ch += static_cast<unsigned char>(*source++);
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
-
- if (target >= targetEnd) {
- source -= (extraBytesToRead + 1); // Back up source pointer!
- result = targetExhausted; break;
- }
- if (ch <= 0xFFFF) {
- // UTF-16 surrogate values are illegal in UTF-32
- if (ch >= 0xD800 && ch <= 0xDFFF) {
- if (strict) {
- source -= (extraBytesToRead + 1); // return to the illegal value itself
- result = sourceIllegal;
- break;
- } else
- *target++ = 0xFFFD;
- } else
- *target++ = (UChar)ch; // normal case
- } else if (ch > 0x10FFFF) {
- if (strict) {
- result = sourceIllegal;
- source -= (extraBytesToRead + 1); // return to the start
- break; // Bail out; shouldn't continue
- } else
- *target++ = 0xFFFD;
- } else {
- // target is a character in range 0xFFFF - 0x10FFFF
- if (target + 1 >= targetEnd) {
- source -= (extraBytesToRead + 1); // Back up source pointer!
- result = targetExhausted;
- break;
- }
- ch -= 0x0010000UL;
- *target++ = (UChar)((ch >> 10) + 0xD800);
- *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
- }
- }
- *sourceStart = source;
- *targetStart = target;
- return result;
-}
-
}
}
// Only allows Unicode characters (U-00000000 to U-0010FFFF).
// Returns -1 if the sequence is not valid (including presence of extra bytes).
int decodeUTF8Sequence(const char*);
-
- typedef enum {
- conversionOK, // conversion successful
- sourceExhausted, // partial character in source, but hit end
- targetExhausted, // insuff. room in target for conversion
- sourceIllegal // source sequence is illegal/malformed
- } ConversionResult;
-
- // These conversion functions take a "strict" argument. When this
- // flag is set to strict, both irregular sequences and isolated surrogates
- // will cause an error. When the flag is set to lenient, both irregular
- // sequences and isolated surrogates are converted.
- //
- // Whether the flag is strict or lenient, all illegal sequences will cause
- // an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
- // or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
- // must check for illegal sequences.
- //
- // When the flag is set to lenient, characters over 0x10FFFF are converted
- // to the replacement character; otherwise (when the flag is set to strict)
- // they constitute an error.
-
- ConversionResult convertUTF8ToUTF16(
- const char** sourceStart, const char* sourceEnd,
- UChar** targetStart, UChar* targetEnd, bool strict = true);
-
- ConversionResult convertUTF16ToUTF8(
- const UChar** sourceStart, const UChar* sourceEnd,
- char** targetStart, char* targetEnd, bool strict = true);
}
}