OSDN Git Service

Reverted 321580: Added support for reading configuration files
[android-x86/external-llvm.git] / unittests / Support / ConvertUTFTest.cpp
1 //===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/ADT/ArrayRef.h"
12 #include "gtest/gtest.h"
13 #include <string>
14 #include <vector>
15
16 using namespace llvm;
17
18 TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19   // Src is the look of disapproval.
20   static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22   std::string Result;
23   bool Success = convertUTF16ToUTF8String(Ref, Result);
24   EXPECT_TRUE(Success);
25   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26   EXPECT_EQ(Expected, Result);
27 }
28
29 TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30   // Src is the look of disapproval.
31   static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32   ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33   std::string Result;
34   bool Success = convertUTF16ToUTF8String(Ref, Result);
35   EXPECT_TRUE(Success);
36   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37   EXPECT_EQ(Expected, Result);
38 }
39
40 TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
41   // Src is the look of disapproval.
42   static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
43   StringRef Ref(Src, sizeof(Src) - 1);
44   SmallVector<UTF16, 5> Result;
45   bool Success = convertUTF8ToUTF16String(Ref, Result);
46   EXPECT_TRUE(Success);
47   static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
48   ASSERT_EQ(3u, Result.size());
49   for (int I = 0, E = 3; I != E; ++I)
50     EXPECT_EQ(Expected[I], Result[I]);
51 }
52
53 TEST(ConvertUTFTest, OddLengthInput) {
54   std::string Result;
55   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
56   EXPECT_FALSE(Success);
57 }
58
59 TEST(ConvertUTFTest, Empty) {
60   std::string Result;
61   bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
62   EXPECT_TRUE(Success);
63   EXPECT_TRUE(Result.empty());
64 }
65
66 TEST(ConvertUTFTest, HasUTF16BOM) {
67   bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
68   EXPECT_TRUE(HasBOM);
69   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
70   EXPECT_TRUE(HasBOM);
71   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
72   EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
73   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
74   EXPECT_TRUE(HasBOM);
75
76   HasBOM = hasUTF16ByteOrderMark(None);
77   EXPECT_FALSE(HasBOM);
78   HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
79   EXPECT_FALSE(HasBOM);
80 }
81
82 TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
83   // Src is the look of disapproval.
84   static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
85   ArrayRef<UTF16> SrcRef = makeArrayRef((const UTF16 *)Src, 4);
86   std::string Result;
87   bool Success = convertUTF16ToUTF8String(SrcRef, Result);
88   EXPECT_TRUE(Success);
89   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
90   EXPECT_EQ(Expected, Result);
91 }
92
93 TEST(ConvertUTFTest, ConvertUTF8toWide) {
94   // Src is the look of disapproval.
95   static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
96   std::wstring Result;
97   bool Success = ConvertUTF8toWide((const char*)Src, Result);
98   EXPECT_TRUE(Success);
99   std::wstring Expected(L"\x0ca0_\x0ca0");
100   EXPECT_EQ(Expected, Result);
101   Result.clear();
102   Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
103   EXPECT_TRUE(Success);
104   EXPECT_EQ(Expected, Result);
105 }
106
107 TEST(ConvertUTFTest, convertWideToUTF8) {
108   // Src is the look of disapproval.
109   static const wchar_t Src[] = L"\x0ca0_\x0ca0";
110   std::string Result;
111   bool Success = convertWideToUTF8(Src, Result);
112   EXPECT_TRUE(Success);
113   std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
114   EXPECT_EQ(Expected, Result);
115 }
116
117 struct ConvertUTFResultContainer {
118   ConversionResult ErrorCode;
119   std::vector<unsigned> UnicodeScalars;
120
121   ConvertUTFResultContainer(ConversionResult ErrorCode)
122       : ErrorCode(ErrorCode) {}
123
124   ConvertUTFResultContainer
125   withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
126               unsigned US2 = 0x110000, unsigned US3 = 0x110000,
127               unsigned US4 = 0x110000, unsigned US5 = 0x110000,
128               unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
129     ConvertUTFResultContainer Result(*this);
130     if (US0 != 0x110000)
131       Result.UnicodeScalars.push_back(US0);
132     if (US1 != 0x110000)
133       Result.UnicodeScalars.push_back(US1);
134     if (US2 != 0x110000)
135       Result.UnicodeScalars.push_back(US2);
136     if (US3 != 0x110000)
137       Result.UnicodeScalars.push_back(US3);
138     if (US4 != 0x110000)
139       Result.UnicodeScalars.push_back(US4);
140     if (US5 != 0x110000)
141       Result.UnicodeScalars.push_back(US5);
142     if (US6 != 0x110000)
143       Result.UnicodeScalars.push_back(US6);
144     if (US7 != 0x110000)
145       Result.UnicodeScalars.push_back(US7);
146     return Result;
147   }
148 };
149
150 std::pair<ConversionResult, std::vector<unsigned>>
151 ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
152   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
153
154   const UTF8 *SourceNext = SourceStart;
155   std::vector<UTF32> Decoded(S.size(), 0);
156   UTF32 *TargetStart = Decoded.data();
157
158   auto ErrorCode =
159       ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
160                          Decoded.data() + Decoded.size(), lenientConversion);
161
162   Decoded.resize(TargetStart - Decoded.data());
163
164   return std::make_pair(ErrorCode, Decoded);
165 }
166
167 std::pair<ConversionResult, std::vector<unsigned>>
168 ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
169   const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
170
171   const UTF8 *SourceNext = SourceStart;
172   std::vector<UTF32> Decoded(S.size(), 0);
173   UTF32 *TargetStart = Decoded.data();
174
175   auto ErrorCode = ConvertUTF8toUTF32Partial(
176       &SourceNext, SourceStart + S.size(), &TargetStart,
177       Decoded.data() + Decoded.size(), lenientConversion);
178
179   Decoded.resize(TargetStart - Decoded.data());
180
181   return std::make_pair(ErrorCode, Decoded);
182 }
183
184 ::testing::AssertionResult
185 CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
186                                  StringRef S, bool Partial = false) {
187   ConversionResult ErrorCode;
188   std::vector<unsigned> Decoded;
189   if (!Partial)
190     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
191   else
192     std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
193
194   if (Expected.ErrorCode != ErrorCode)
195     return ::testing::AssertionFailure() << "Expected error code "
196                                          << Expected.ErrorCode << ", actual "
197                                          << ErrorCode;
198
199   if (Expected.UnicodeScalars != Decoded)
200     return ::testing::AssertionFailure()
201            << "Expected lenient decoded result:\n"
202            << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
203            << "Actual result:\n" << ::testing::PrintToString(Decoded);
204
205   return ::testing::AssertionSuccess();
206 }
207
208 TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
209
210   //
211   // 1-byte sequences
212   //
213
214   // U+0041 LATIN CAPITAL LETTER A
215   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
216       ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
217
218   //
219   // 2-byte sequences
220   //
221
222   // U+0283 LATIN SMALL LETTER ESH
223   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
224       ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
225       "\xca\x83"));
226
227   // U+03BA GREEK SMALL LETTER KAPPA
228   // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
229   // U+03C3 GREEK SMALL LETTER SIGMA
230   // U+03BC GREEK SMALL LETTER MU
231   // U+03B5 GREEK SMALL LETTER EPSILON
232   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
233       ConvertUTFResultContainer(conversionOK)
234           .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
235       "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
236
237   //
238   // 3-byte sequences
239   //
240
241   // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
242   // U+6587 CJK UNIFIED IDEOGRAPH-6587
243   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
244       ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
245       "\xe4\xbe\x8b\xe6\x96\x87"));
246
247   // U+D55C HANGUL SYLLABLE HAN
248   // U+AE00 HANGUL SYLLABLE GEUL
249   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
250       ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
251       "\xed\x95\x9c\xea\xb8\x80"));
252
253   // U+1112 HANGUL CHOSEONG HIEUH
254   // U+1161 HANGUL JUNGSEONG A
255   // U+11AB HANGUL JONGSEONG NIEUN
256   // U+1100 HANGUL CHOSEONG KIYEOK
257   // U+1173 HANGUL JUNGSEONG EU
258   // U+11AF HANGUL JONGSEONG RIEUL
259   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
260       ConvertUTFResultContainer(conversionOK)
261           .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
262       "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
263       "\xe1\x86\xaf"));
264
265   //
266   // 4-byte sequences
267   //
268
269   // U+E0100 VARIATION SELECTOR-17
270   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
271       ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
272       "\xf3\xa0\x84\x80"));
273
274   //
275   // First possible sequence of a certain length
276   //
277
278   // U+0000 NULL
279   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
280       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
281       StringRef("\x00", 1)));
282
283   // U+0080 PADDING CHARACTER
284   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
285       ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
286       "\xc2\x80"));
287
288   // U+0800 SAMARITAN LETTER ALAF
289   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
290       ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
291       "\xe0\xa0\x80"));
292
293   // U+10000 LINEAR B SYLLABLE B008 A
294   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
295       ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
296       "\xf0\x90\x80\x80"));
297
298   // U+200000 (invalid)
299   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
300       ConvertUTFResultContainer(sourceIllegal)
301           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
302       "\xf8\x88\x80\x80\x80"));
303
304   // U+4000000 (invalid)
305   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
306       ConvertUTFResultContainer(sourceIllegal)
307           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
308       "\xfc\x84\x80\x80\x80\x80"));
309
310   //
311   // Last possible sequence of a certain length
312   //
313
314   // U+007F DELETE
315   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
316       ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
317
318   // U+07FF (unassigned)
319   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
320       ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
321       "\xdf\xbf"));
322
323   // U+FFFF (noncharacter)
324   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
325       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
326       "\xef\xbf\xbf"));
327
328   // U+1FFFFF (invalid)
329   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
330       ConvertUTFResultContainer(sourceIllegal)
331           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
332       "\xf7\xbf\xbf\xbf"));
333
334   // U+3FFFFFF (invalid)
335   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336       ConvertUTFResultContainer(sourceIllegal)
337           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
338       "\xfb\xbf\xbf\xbf\xbf"));
339
340   // U+7FFFFFFF (invalid)
341   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
342       ConvertUTFResultContainer(sourceIllegal)
343           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
344       "\xfd\xbf\xbf\xbf\xbf\xbf"));
345
346   //
347   // Other boundary conditions
348   //
349
350   // U+D7FF (unassigned)
351   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
352       ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
353       "\xed\x9f\xbf"));
354
355   // U+E000 (private use)
356   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357       ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
358       "\xee\x80\x80"));
359
360   // U+FFFD REPLACEMENT CHARACTER
361   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
362       ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
363       "\xef\xbf\xbd"));
364
365   // U+10FFFF (noncharacter)
366   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
367       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
368       "\xf4\x8f\xbf\xbf"));
369
370   // U+110000 (invalid)
371   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
372       ConvertUTFResultContainer(sourceIllegal)
373           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
374       "\xf4\x90\x80\x80"));
375
376   //
377   // Unexpected continuation bytes
378   //
379
380   // A sequence of unexpected continuation bytes that don't follow a first
381   // byte, every byte is a maximal subpart.
382
383   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
384       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
385   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
386       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
387   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
388       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
389       "\x80\x80"));
390   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
391       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
392       "\x80\xbf"));
393   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
394       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
395       "\xbf\x80"));
396   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
397       ConvertUTFResultContainer(sourceIllegal)
398           .withScalars(0xfffd, 0xfffd, 0xfffd),
399       "\x80\xbf\x80"));
400   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
401       ConvertUTFResultContainer(sourceIllegal)
402           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
403       "\x80\xbf\x80\xbf"));
404   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
405       ConvertUTFResultContainer(sourceIllegal)
406           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
407       "\x80\xbf\x82\xbf\xaa"));
408   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
409       ConvertUTFResultContainer(sourceIllegal)
410           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
411       "\xaa\xb0\xbb\xbf\xaa\xa0"));
412   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
413       ConvertUTFResultContainer(sourceIllegal)
414           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
415       "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
416
417   // All continuation bytes (0x80--0xbf).
418   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
419       ConvertUTFResultContainer(sourceIllegal)
420           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
421                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
422           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
423                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
424           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
425                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
426           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
427                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
428           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
429                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
430           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
431                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
432           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
433                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
434           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
435                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
436       "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
437       "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
438       "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
439       "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
440
441   //
442   // Lonely start bytes
443   //
444
445   // Start bytes of 2-byte sequences (0xc0--0xdf).
446   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
447       ConvertUTFResultContainer(sourceIllegal)
448           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
449                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
450           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
451                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
452           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
453                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
454           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
455                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
456       "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
457       "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
458
459   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
460       ConvertUTFResultContainer(sourceIllegal)
461           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
462                        0xfffd, 0x0020, 0xfffd, 0x0020)
463           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
464                        0xfffd, 0x0020, 0xfffd, 0x0020)
465           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                        0xfffd, 0x0020, 0xfffd, 0x0020)
467           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468                        0xfffd, 0x0020, 0xfffd, 0x0020)
469           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
470                        0xfffd, 0x0020, 0xfffd, 0x0020)
471           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
472                        0xfffd, 0x0020, 0xfffd, 0x0020)
473           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
474                        0xfffd, 0x0020, 0xfffd, 0x0020)
475           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
476                        0xfffd, 0x0020, 0xfffd, 0x0020),
477       "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
478       "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
479       "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
480       "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
481
482   // Start bytes of 3-byte sequences (0xe0--0xef).
483   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
484       ConvertUTFResultContainer(sourceIllegal)
485           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
486                        0xfffd, 0xfffd, 0xfffd, 0xfffd)
487           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
488                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
489       "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
490
491   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
492       ConvertUTFResultContainer(sourceIllegal)
493           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
494                        0xfffd, 0x0020, 0xfffd, 0x0020)
495           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
496                        0xfffd, 0x0020, 0xfffd, 0x0020)
497           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
498                        0xfffd, 0x0020, 0xfffd, 0x0020)
499           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
500                        0xfffd, 0x0020, 0xfffd, 0x0020),
501       "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
502       "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
503
504   // Start bytes of 4-byte sequences (0xf0--0xf7).
505   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
506       ConvertUTFResultContainer(sourceIllegal)
507           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
508                        0xfffd, 0xfffd, 0xfffd, 0xfffd),
509       "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
510
511   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512       ConvertUTFResultContainer(sourceIllegal)
513           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
514                        0xfffd, 0x0020, 0xfffd, 0x0020)
515           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
516                        0xfffd, 0x0020, 0xfffd, 0x0020),
517       "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
518
519   // Start bytes of 5-byte sequences (0xf8--0xfb).
520   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
521       ConvertUTFResultContainer(sourceIllegal)
522           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
523       "\xf8\xf9\xfa\xfb"));
524
525   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
526       ConvertUTFResultContainer(sourceIllegal)
527           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
528                        0xfffd, 0x0020, 0xfffd, 0x0020),
529       "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
530
531   // Start bytes of 6-byte sequences (0xfc--0xfd).
532   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
533       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
534       "\xfc\xfd"));
535
536   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537       ConvertUTFResultContainer(sourceIllegal)
538           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
539       "\xfc\x20\xfd\x20"));
540
541   //
542   // Other bytes (0xc0--0xc1, 0xfe--0xff).
543   //
544
545   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
546       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
547   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
548       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
549   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
551   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
552       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
553
554   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
555       ConvertUTFResultContainer(sourceIllegal)
556           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
557       "\xc0\xc1\xfe\xff"));
558
559   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
560       ConvertUTFResultContainer(sourceIllegal)
561           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
562       "\xfe\xfe\xff\xff"));
563
564   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565       ConvertUTFResultContainer(sourceIllegal)
566           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
567       "\xfe\x80\x80\x80\x80\x80"));
568
569   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
570       ConvertUTFResultContainer(sourceIllegal)
571           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
572       "\xff\x80\x80\x80\x80\x80"));
573
574   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
575       ConvertUTFResultContainer(sourceIllegal)
576           .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
577                        0xfffd, 0x0020, 0xfffd, 0x0020),
578       "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
579
580   //
581   // Sequences with one continuation byte missing
582   //
583
584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
586   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
587       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
588   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
589       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
590       "\xe0\xa0"));
591   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
592       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
593       "\xe0\xbf"));
594   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
595       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
596       "\xe1\x80"));
597   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
598       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
599       "\xec\xbf"));
600   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
602       "\xed\x80"));
603   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
604       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
605       "\xed\x9f"));
606   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
607       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
608       "\xee\x80"));
609   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
610       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
611       "\xef\xbf"));
612   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
613       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
614       "\xf0\x90\x80"));
615   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
617       "\xf0\xbf\xbf"));
618   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
620       "\xf1\x80\x80"));
621   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
623       "\xf3\xbf\xbf"));
624   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
625       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
626       "\xf4\x80\x80"));
627   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
628       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
629       "\xf4\x8f\xbf"));
630
631   // Overlong sequences with one trailing byte missing.
632   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
633       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
634       "\xc0"));
635   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
636       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
637       "\xc1"));
638   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
640       "\xe0\x80"));
641   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
642       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
643       "\xe0\x9f"));
644   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
645       ConvertUTFResultContainer(sourceIllegal)
646           .withScalars(0xfffd, 0xfffd, 0xfffd),
647       "\xf0\x80\x80"));
648   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
649       ConvertUTFResultContainer(sourceIllegal)
650           .withScalars(0xfffd, 0xfffd, 0xfffd),
651       "\xf0\x8f\x80"));
652   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
653       ConvertUTFResultContainer(sourceIllegal)
654           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
655       "\xf8\x80\x80\x80"));
656   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
657       ConvertUTFResultContainer(sourceIllegal)
658           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
659       "\xfc\x80\x80\x80\x80"));
660
661   // Sequences that represent surrogates with one trailing byte missing.
662   // High surrogates
663   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
664       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
665       "\xed\xa0"));
666   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
667       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
668       "\xed\xac"));
669   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
670       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
671       "\xed\xaf"));
672   // Low surrogates
673   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
674       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
675       "\xed\xb0"));
676   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
678       "\xed\xb4"));
679   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
680       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
681       "\xed\xbf"));
682
683   // Ill-formed 4-byte sequences.
684   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
685   // U+1100xx (invalid)
686   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
687       ConvertUTFResultContainer(sourceIllegal)
688           .withScalars(0xfffd, 0xfffd, 0xfffd),
689       "\xf4\x90\x80"));
690   // U+13FBxx (invalid)
691   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
692       ConvertUTFResultContainer(sourceIllegal)
693           .withScalars(0xfffd, 0xfffd, 0xfffd),
694       "\xf4\xbf\xbf"));
695   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
696       ConvertUTFResultContainer(sourceIllegal)
697           .withScalars(0xfffd, 0xfffd, 0xfffd),
698       "\xf5\x80\x80"));
699   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
700       ConvertUTFResultContainer(sourceIllegal)
701           .withScalars(0xfffd, 0xfffd, 0xfffd),
702       "\xf6\x80\x80"));
703   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
704       ConvertUTFResultContainer(sourceIllegal)
705           .withScalars(0xfffd, 0xfffd, 0xfffd),
706       "\xf7\x80\x80"));
707   // U+1FFBxx (invalid)
708   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
709       ConvertUTFResultContainer(sourceIllegal)
710           .withScalars(0xfffd, 0xfffd, 0xfffd),
711       "\xf7\xbf\xbf"));
712
713   // Ill-formed 5-byte sequences.
714   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
715   // U+2000xx (invalid)
716   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
717       ConvertUTFResultContainer(sourceIllegal)
718           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
719       "\xf8\x88\x80\x80"));
720   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
721       ConvertUTFResultContainer(sourceIllegal)
722           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
723       "\xf8\xbf\xbf\xbf"));
724   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
725       ConvertUTFResultContainer(sourceIllegal)
726           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
727       "\xf9\x80\x80\x80"));
728   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729       ConvertUTFResultContainer(sourceIllegal)
730           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
731       "\xfa\x80\x80\x80"));
732   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
733       ConvertUTFResultContainer(sourceIllegal)
734           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
735       "\xfb\x80\x80\x80"));
736   // U+3FFFFxx (invalid)
737   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
738       ConvertUTFResultContainer(sourceIllegal)
739           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
740       "\xfb\xbf\xbf\xbf"));
741
742   // Ill-formed 6-byte sequences.
743   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
744   // U+40000xx (invalid)
745   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
746       ConvertUTFResultContainer(sourceIllegal)
747           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
748       "\xfc\x84\x80\x80\x80"));
749   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
750       ConvertUTFResultContainer(sourceIllegal)
751           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
752       "\xfc\xbf\xbf\xbf\xbf"));
753   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
754       ConvertUTFResultContainer(sourceIllegal)
755           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
756       "\xfd\x80\x80\x80\x80"));
757   // U+7FFFFFxx (invalid)
758   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
759       ConvertUTFResultContainer(sourceIllegal)
760           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
761       "\xfd\xbf\xbf\xbf\xbf"));
762
763   //
764   // Sequences with two continuation bytes missing
765   //
766
767   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
769       "\xf0\x90"));
770   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
772       "\xf0\xbf"));
773   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
775       "\xf1\x80"));
776   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
778       "\xf3\xbf"));
779   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
780       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
781       "\xf4\x80"));
782   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
783       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
784       "\xf4\x8f"));
785
786   // Overlong sequences with two trailing bytes missing.
787   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
789   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
790       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
791       "\xf0\x80"));
792   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
793       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
794       "\xf0\x8f"));
795   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
796       ConvertUTFResultContainer(sourceIllegal)
797           .withScalars(0xfffd, 0xfffd, 0xfffd),
798       "\xf8\x80\x80"));
799   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800       ConvertUTFResultContainer(sourceIllegal)
801           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
802       "\xfc\x80\x80\x80"));
803
804   // Sequences that represent surrogates with two trailing bytes missing.
805   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
806       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
807
808   // Ill-formed 4-byte sequences.
809   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
810   // U+110yxx (invalid)
811   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
812       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
813       "\xf4\x90"));
814   // U+13Fyxx (invalid)
815   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
816       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
817       "\xf4\xbf"));
818   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
819       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
820       "\xf5\x80"));
821   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
822       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
823       "\xf6\x80"));
824   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
825       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
826       "\xf7\x80"));
827   // U+1FFyxx (invalid)
828   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
830       "\xf7\xbf"));
831
832   // Ill-formed 5-byte sequences.
833   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
834   // U+200yxx (invalid)
835   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
836       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
837       "\xf8\x88\x80"));
838   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
839       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
840       "\xf8\xbf\xbf"));
841   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
842       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
843       "\xf9\x80\x80"));
844   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
845       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
846       "\xfa\x80\x80"));
847   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
848       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
849       "\xfb\x80\x80"));
850   // U+3FFFyxx (invalid)
851   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
852       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
853       "\xfb\xbf\xbf"));
854
855   // Ill-formed 6-byte sequences.
856   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
857   // U+4000yxx (invalid)
858   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
859       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
860       "\xfc\x84\x80\x80"));
861   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
862       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
863       "\xfc\xbf\xbf\xbf"));
864   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
865       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
866       "\xfd\x80\x80\x80"));
867   // U+7FFFFyxx (invalid)
868   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
869       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
870       "\xfd\xbf\xbf\xbf"));
871
872   //
873   // Sequences with three continuation bytes missing
874   //
875
876   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
877       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
878   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
879       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
880   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
881       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
882   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
883       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
884   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
885       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
886
887   // Broken overlong sequences.
888   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
889       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
890   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
891       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
892       "\xf8\x80"));
893   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
894       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
895       "\xfc\x80\x80"));
896
897   // Ill-formed 4-byte sequences.
898   // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
899   // U+14yyxx (invalid)
900   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
901       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
902   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
903       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
904   // U+1Cyyxx (invalid)
905   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
906       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
907
908   // Ill-formed 5-byte sequences.
909   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
910   // U+20yyxx (invalid)
911   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
913       "\xf8\x88"));
914   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
915       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
916       "\xf8\xbf"));
917   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
918       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
919       "\xf9\x80"));
920   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
922       "\xfa\x80"));
923   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
924       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
925       "\xfb\x80"));
926   // U+3FCyyxx (invalid)
927   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
928       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
929       "\xfb\xbf"));
930
931   // Ill-formed 6-byte sequences.
932   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
933   // U+400yyxx (invalid)
934   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
935       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
936       "\xfc\x84\x80"));
937   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
938       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
939       "\xfc\xbf\xbf"));
940   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
941       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
942       "\xfd\x80\x80"));
943   // U+7FFCyyxx (invalid)
944   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
945       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
946       "\xfd\xbf\xbf"));
947
948   //
949   // Sequences with four continuation bytes missing
950   //
951
952   // Ill-formed 5-byte sequences.
953   // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
954   // U+uzyyxx (invalid)
955   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
956       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
957   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
958       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
959   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
960       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
961   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
962       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
963   // U+3zyyxx (invalid)
964   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
965       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
966
967   // Broken overlong sequences.
968   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
969       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
970   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
971       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
972       "\xfc\x80"));
973
974   // Ill-formed 6-byte sequences.
975   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
976   // U+uzzyyxx (invalid)
977   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
978       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
979       "\xfc\x84"));
980   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
982       "\xfc\xbf"));
983   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
984       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
985       "\xfd\x80"));
986   // U+7Fzzyyxx (invalid)
987   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
989       "\xfd\xbf"));
990
991   //
992   // Sequences with five continuation bytes missing
993   //
994
995   // Ill-formed 6-byte sequences.
996   // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
997   // U+uzzyyxx (invalid)
998   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
999       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
1000   // U+uuzzyyxx (invalid)
1001   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1002       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
1003
1004   //
1005   // Consecutive sequences with trailing bytes missing
1006   //
1007
1008   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1009       ConvertUTFResultContainer(sourceIllegal)
1010           .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1011           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1012           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
1013           .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
1014           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
1015           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1016       "\xc0" "\xe0\x80" "\xf0\x80\x80"
1017       "\xf8\x80\x80\x80"
1018       "\xfc\x80\x80\x80\x80"
1019       "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
1020       "\xfb\xbf\xbf\xbf"
1021       "\xfd\xbf\xbf\xbf\xbf"));
1022
1023   //
1024   // Overlong UTF-8 sequences
1025   //
1026
1027   // U+002F SOLIDUS
1028   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1029       ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
1030
1031   // Overlong sequences of the above.
1032   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1033       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1034       "\xc0\xaf"));
1035   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1036       ConvertUTFResultContainer(sourceIllegal)
1037           .withScalars(0xfffd, 0xfffd, 0xfffd),
1038       "\xe0\x80\xaf"));
1039   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1040       ConvertUTFResultContainer(sourceIllegal)
1041           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1042       "\xf0\x80\x80\xaf"));
1043   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1044       ConvertUTFResultContainer(sourceIllegal)
1045           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1046       "\xf8\x80\x80\x80\xaf"));
1047   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1048       ConvertUTFResultContainer(sourceIllegal)
1049           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1050       "\xfc\x80\x80\x80\x80\xaf"));
1051
1052   // U+0000 NULL
1053   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1054       ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1055       StringRef("\x00", 1)));
1056
1057   // Overlong sequences of the above.
1058   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1059       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1060       "\xc0\x80"));
1061   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1062       ConvertUTFResultContainer(sourceIllegal)
1063           .withScalars(0xfffd, 0xfffd, 0xfffd),
1064       "\xe0\x80\x80"));
1065   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1066       ConvertUTFResultContainer(sourceIllegal)
1067           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1068       "\xf0\x80\x80\x80"));
1069   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1070       ConvertUTFResultContainer(sourceIllegal)
1071           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1072       "\xf8\x80\x80\x80\x80"));
1073   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1074       ConvertUTFResultContainer(sourceIllegal)
1075           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1076       "\xfc\x80\x80\x80\x80\x80"));
1077
1078   // Other overlong sequences.
1079   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1080       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1081       "\xc0\xbf"));
1082   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1083       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1084       "\xc1\x80"));
1085   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1086       ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1087       "\xc1\xbf"));
1088   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1089       ConvertUTFResultContainer(sourceIllegal)
1090           .withScalars(0xfffd, 0xfffd, 0xfffd),
1091       "\xe0\x9f\xbf"));
1092   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093       ConvertUTFResultContainer(sourceIllegal)
1094           .withScalars(0xfffd, 0xfffd, 0xfffd),
1095       "\xed\xa0\x80"));
1096   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1097       ConvertUTFResultContainer(sourceIllegal)
1098           .withScalars(0xfffd, 0xfffd, 0xfffd),
1099       "\xed\xbf\xbf"));
1100   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1101       ConvertUTFResultContainer(sourceIllegal)
1102           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1103       "\xf0\x8f\x80\x80"));
1104   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1105       ConvertUTFResultContainer(sourceIllegal)
1106           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1107       "\xf0\x8f\xbf\xbf"));
1108   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1109       ConvertUTFResultContainer(sourceIllegal)
1110           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1111       "\xf8\x87\xbf\xbf\xbf"));
1112   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113       ConvertUTFResultContainer(sourceIllegal)
1114           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1115       "\xfc\x83\xbf\xbf\xbf\xbf"));
1116
1117   //
1118   // Isolated surrogates
1119   //
1120
1121   // Unicode 6.3.0:
1122   //
1123   //    D71.  High-surrogate code point: A Unicode code point in the range
1124   //    U+D800 to U+DBFF.
1125   //
1126   //    D73.  Low-surrogate code point: A Unicode code point in the range
1127   //    U+DC00 to U+DFFF.
1128
1129   // Note: U+E0100 is <DB40 DD00> in UTF16.
1130
1131   // High surrogates
1132
1133   // U+D800
1134   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1135       ConvertUTFResultContainer(sourceIllegal)
1136           .withScalars(0xfffd, 0xfffd, 0xfffd),
1137       "\xed\xa0\x80"));
1138
1139   // U+DB40
1140   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1141       ConvertUTFResultContainer(sourceIllegal)
1142           .withScalars(0xfffd, 0xfffd, 0xfffd),
1143       "\xed\xac\xa0"));
1144
1145   // U+DBFF
1146   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1147       ConvertUTFResultContainer(sourceIllegal)
1148           .withScalars(0xfffd, 0xfffd, 0xfffd),
1149       "\xed\xaf\xbf"));
1150
1151   // Low surrogates
1152
1153   // U+DC00
1154   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1155       ConvertUTFResultContainer(sourceIllegal)
1156           .withScalars(0xfffd, 0xfffd, 0xfffd),
1157       "\xed\xb0\x80"));
1158
1159   // U+DD00
1160   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1161       ConvertUTFResultContainer(sourceIllegal)
1162           .withScalars(0xfffd, 0xfffd, 0xfffd),
1163       "\xed\xb4\x80"));
1164
1165   // U+DFFF
1166   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1167       ConvertUTFResultContainer(sourceIllegal)
1168           .withScalars(0xfffd, 0xfffd, 0xfffd),
1169       "\xed\xbf\xbf"));
1170
1171   // Surrogate pairs
1172
1173   // U+D800 U+DC00
1174   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175       ConvertUTFResultContainer(sourceIllegal)
1176           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177       "\xed\xa0\x80\xed\xb0\x80"));
1178
1179   // U+D800 U+DD00
1180   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1181       ConvertUTFResultContainer(sourceIllegal)
1182           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1183       "\xed\xa0\x80\xed\xb4\x80"));
1184
1185   // U+D800 U+DFFF
1186   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1187       ConvertUTFResultContainer(sourceIllegal)
1188           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1189       "\xed\xa0\x80\xed\xbf\xbf"));
1190
1191   // U+DB40 U+DC00
1192   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1193       ConvertUTFResultContainer(sourceIllegal)
1194           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1195       "\xed\xac\xa0\xed\xb0\x80"));
1196
1197   // U+DB40 U+DD00
1198   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1199       ConvertUTFResultContainer(sourceIllegal)
1200           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1201       "\xed\xac\xa0\xed\xb4\x80"));
1202
1203   // U+DB40 U+DFFF
1204   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1205       ConvertUTFResultContainer(sourceIllegal)
1206           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1207       "\xed\xac\xa0\xed\xbf\xbf"));
1208
1209   // U+DBFF U+DC00
1210   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1211       ConvertUTFResultContainer(sourceIllegal)
1212           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1213       "\xed\xaf\xbf\xed\xb0\x80"));
1214
1215   // U+DBFF U+DD00
1216   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217       ConvertUTFResultContainer(sourceIllegal)
1218           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1219       "\xed\xaf\xbf\xed\xb4\x80"));
1220
1221   // U+DBFF U+DFFF
1222   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1223       ConvertUTFResultContainer(sourceIllegal)
1224           .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1225       "\xed\xaf\xbf\xed\xbf\xbf"));
1226
1227   //
1228   // Noncharacters
1229   //
1230
1231   // Unicode 6.3.0:
1232   //
1233   //    D14.  Noncharacter: A code point that is permanently reserved for
1234   //    internal use and that should never be interchanged. Noncharacters
1235   //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1236   //    and the values U+FDD0..U+FDEF.
1237
1238   // U+FFFE
1239   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1240       ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1241       "\xef\xbf\xbe"));
1242
1243   // U+FFFF
1244   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1245       ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1246       "\xef\xbf\xbf"));
1247
1248   // U+1FFFE
1249   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1250       ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1251       "\xf0\x9f\xbf\xbe"));
1252
1253   // U+1FFFF
1254   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1255       ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1256       "\xf0\x9f\xbf\xbf"));
1257
1258   // U+2FFFE
1259   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1260       ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1261       "\xf0\xaf\xbf\xbe"));
1262
1263   // U+2FFFF
1264   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1265       ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1266       "\xf0\xaf\xbf\xbf"));
1267
1268   // U+3FFFE
1269   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1270       ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1271       "\xf0\xbf\xbf\xbe"));
1272
1273   // U+3FFFF
1274   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1275       ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1276       "\xf0\xbf\xbf\xbf"));
1277
1278   // U+4FFFE
1279   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1280       ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1281       "\xf1\x8f\xbf\xbe"));
1282
1283   // U+4FFFF
1284   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1285       ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1286       "\xf1\x8f\xbf\xbf"));
1287
1288   // U+5FFFE
1289   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1290       ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1291       "\xf1\x9f\xbf\xbe"));
1292
1293   // U+5FFFF
1294   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1295       ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1296       "\xf1\x9f\xbf\xbf"));
1297
1298   // U+6FFFE
1299   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1300       ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1301       "\xf1\xaf\xbf\xbe"));
1302
1303   // U+6FFFF
1304   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1305       ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1306       "\xf1\xaf\xbf\xbf"));
1307
1308   // U+7FFFE
1309   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1310       ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1311       "\xf1\xbf\xbf\xbe"));
1312
1313   // U+7FFFF
1314   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1315       ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1316       "\xf1\xbf\xbf\xbf"));
1317
1318   // U+8FFFE
1319   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1320       ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1321       "\xf2\x8f\xbf\xbe"));
1322
1323   // U+8FFFF
1324   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1325       ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1326       "\xf2\x8f\xbf\xbf"));
1327
1328   // U+9FFFE
1329   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1330       ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1331       "\xf2\x9f\xbf\xbe"));
1332
1333   // U+9FFFF
1334   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1335       ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1336       "\xf2\x9f\xbf\xbf"));
1337
1338   // U+AFFFE
1339   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1340       ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1341       "\xf2\xaf\xbf\xbe"));
1342
1343   // U+AFFFF
1344   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1345       ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1346       "\xf2\xaf\xbf\xbf"));
1347
1348   // U+BFFFE
1349   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1350       ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1351       "\xf2\xbf\xbf\xbe"));
1352
1353   // U+BFFFF
1354   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1355       ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1356       "\xf2\xbf\xbf\xbf"));
1357
1358   // U+CFFFE
1359   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1360       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1361       "\xf3\x8f\xbf\xbe"));
1362
1363   // U+CFFFF
1364   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1365       ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1366       "\xf3\x8f\xbf\xbf"));
1367
1368   // U+DFFFE
1369   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1370       ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1371       "\xf3\x9f\xbf\xbe"));
1372
1373   // U+DFFFF
1374   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1375       ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1376       "\xf3\x9f\xbf\xbf"));
1377
1378   // U+EFFFE
1379   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1380       ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1381       "\xf3\xaf\xbf\xbe"));
1382
1383   // U+EFFFF
1384   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1385       ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1386       "\xf3\xaf\xbf\xbf"));
1387
1388   // U+FFFFE
1389   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1390       ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1391       "\xf3\xbf\xbf\xbe"));
1392
1393   // U+FFFFF
1394   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1395       ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1396       "\xf3\xbf\xbf\xbf"));
1397
1398   // U+10FFFE
1399   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1400       ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1401       "\xf4\x8f\xbf\xbe"));
1402
1403   // U+10FFFF
1404   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1405       ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1406       "\xf4\x8f\xbf\xbf"));
1407
1408   // U+FDD0
1409   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1410       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1411       "\xef\xb7\x90"));
1412
1413   // U+FDD1
1414   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1415       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1416       "\xef\xb7\x91"));
1417
1418   // U+FDD2
1419   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1420       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1421       "\xef\xb7\x92"));
1422
1423   // U+FDD3
1424   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1425       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1426       "\xef\xb7\x93"));
1427
1428   // U+FDD4
1429   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1430       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1431       "\xef\xb7\x94"));
1432
1433   // U+FDD5
1434   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1435       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1436       "\xef\xb7\x95"));
1437
1438   // U+FDD6
1439   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1440       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1441       "\xef\xb7\x96"));
1442
1443   // U+FDD7
1444   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1445       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1446       "\xef\xb7\x97"));
1447
1448   // U+FDD8
1449   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1450       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1451       "\xef\xb7\x98"));
1452
1453   // U+FDD9
1454   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1455       ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1456       "\xef\xb7\x99"));
1457
1458   // U+FDDA
1459   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1460       ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1461       "\xef\xb7\x9a"));
1462
1463   // U+FDDB
1464   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1465       ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1466       "\xef\xb7\x9b"));
1467
1468   // U+FDDC
1469   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1470       ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1471       "\xef\xb7\x9c"));
1472
1473   // U+FDDD
1474   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1475       ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1476       "\xef\xb7\x9d"));
1477
1478   // U+FDDE
1479   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1480       ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1481       "\xef\xb7\x9e"));
1482
1483   // U+FDDF
1484   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1485       ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1486       "\xef\xb7\x9f"));
1487
1488   // U+FDE0
1489   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1490       ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1491       "\xef\xb7\xa0"));
1492
1493   // U+FDE1
1494   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1495       ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1496       "\xef\xb7\xa1"));
1497
1498   // U+FDE2
1499   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1500       ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1501       "\xef\xb7\xa2"));
1502
1503   // U+FDE3
1504   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1505       ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1506       "\xef\xb7\xa3"));
1507
1508   // U+FDE4
1509   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1510       ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1511       "\xef\xb7\xa4"));
1512
1513   // U+FDE5
1514   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1515       ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1516       "\xef\xb7\xa5"));
1517
1518   // U+FDE6
1519   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1520       ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1521       "\xef\xb7\xa6"));
1522
1523   // U+FDE7
1524   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1525       ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1526       "\xef\xb7\xa7"));
1527
1528   // U+FDE8
1529   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1530       ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1531       "\xef\xb7\xa8"));
1532
1533   // U+FDE9
1534   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1535       ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1536       "\xef\xb7\xa9"));
1537
1538   // U+FDEA
1539   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1540       ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1541       "\xef\xb7\xaa"));
1542
1543   // U+FDEB
1544   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1545       ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1546       "\xef\xb7\xab"));
1547
1548   // U+FDEC
1549   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1550       ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1551       "\xef\xb7\xac"));
1552
1553   // U+FDED
1554   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1555       ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1556       "\xef\xb7\xad"));
1557
1558   // U+FDEE
1559   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1560       ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1561       "\xef\xb7\xae"));
1562
1563   // U+FDEF
1564   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1565       ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1566       "\xef\xb7\xaf"));
1567
1568   // U+FDF0
1569   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1570       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1571       "\xef\xb7\xb0"));
1572
1573   // U+FDF1
1574   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1575       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1576       "\xef\xb7\xb1"));
1577
1578   // U+FDF2
1579   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1580       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1581       "\xef\xb7\xb2"));
1582
1583   // U+FDF3
1584   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1585       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1586       "\xef\xb7\xb3"));
1587
1588   // U+FDF4
1589   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1590       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1591       "\xef\xb7\xb4"));
1592
1593   // U+FDF5
1594   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1595       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1596       "\xef\xb7\xb5"));
1597
1598   // U+FDF6
1599   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1600       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1601       "\xef\xb7\xb6"));
1602
1603   // U+FDF7
1604   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1605       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1606       "\xef\xb7\xb7"));
1607
1608   // U+FDF8
1609   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1610       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1611       "\xef\xb7\xb8"));
1612
1613   // U+FDF9
1614   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615       ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1616       "\xef\xb7\xb9"));
1617
1618   // U+FDFA
1619   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1620       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1621       "\xef\xb7\xba"));
1622
1623   // U+FDFB
1624   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1625       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1626       "\xef\xb7\xbb"));
1627
1628   // U+FDFC
1629   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1631       "\xef\xb7\xbc"));
1632
1633   // U+FDFD
1634   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1635       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1636       "\xef\xb7\xbd"));
1637
1638   // U+FDFE
1639   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1640       ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1641       "\xef\xb7\xbe"));
1642
1643   // U+FDFF
1644   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645       ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1646       "\xef\xb7\xbf"));
1647 }
1648
1649 TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1650   // U+0041 LATIN CAPITAL LETTER A
1651   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1652       ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1653       "\x41", true));
1654
1655   //
1656   // Sequences with one continuation byte missing
1657   //
1658
1659   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1660       ConvertUTFResultContainer(sourceExhausted),
1661       "\xc2", true));
1662   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1663       ConvertUTFResultContainer(sourceExhausted),
1664       "\xdf", true));
1665   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1666       ConvertUTFResultContainer(sourceExhausted),
1667       "\xe0\xa0", true));
1668   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1669       ConvertUTFResultContainer(sourceExhausted),
1670       "\xe0\xbf", true));
1671   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1672       ConvertUTFResultContainer(sourceExhausted),
1673       "\xe1\x80", true));
1674   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1675       ConvertUTFResultContainer(sourceExhausted),
1676       "\xec\xbf", true));
1677   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1678       ConvertUTFResultContainer(sourceExhausted),
1679       "\xed\x80", true));
1680   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1681       ConvertUTFResultContainer(sourceExhausted),
1682       "\xed\x9f", true));
1683   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1684       ConvertUTFResultContainer(sourceExhausted),
1685       "\xee\x80", true));
1686   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1687       ConvertUTFResultContainer(sourceExhausted),
1688       "\xef\xbf", true));
1689   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1690       ConvertUTFResultContainer(sourceExhausted),
1691       "\xf0\x90\x80", true));
1692   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1693       ConvertUTFResultContainer(sourceExhausted),
1694       "\xf0\xbf\xbf", true));
1695   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1696       ConvertUTFResultContainer(sourceExhausted),
1697       "\xf1\x80\x80", true));
1698   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1699       ConvertUTFResultContainer(sourceExhausted),
1700       "\xf3\xbf\xbf", true));
1701   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1702       ConvertUTFResultContainer(sourceExhausted),
1703       "\xf4\x80\x80", true));
1704   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1705       ConvertUTFResultContainer(sourceExhausted),
1706       "\xf4\x8f\xbf", true));
1707
1708   EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1709       ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1710       "\x41\xc2", true));
1711 }
1712