2 * Copyright (C) 2002 by Red Hat, Incorporated. All rights reserved.
4 * Permission to use, copy, modify, and distribute this software
5 * is freely granted, provided that this notice is preserved.
7 * Tests gleaned from Markus Kuhn's UTF-8 and Unicode FAQ,
8 * and specifically, his UTF-8-test.txt decoder stress test file.
/* Forward declaration of num_invalid, which is defined near the end of this
   file and called by the section 3.1 and 3.2 checks.  Per the comment on its
   definition, it returns the number of invalid characters found in the LEN
   bytes at S, or -1 if a valid character is found. */
17 int num_invalid(const char *s, int len);
/* Six rows of 1, 2, 3, 4, 5 and 6 bytes, each annotated with a code point.
   NOTE(review): the declaration line for these rows is not visible here;
   they are presumably the `first` table read by test 2.1 -- confirm. */
20 {0x0}, /* U-00000000 */
21 {0xc2, 0x80}, /* U-00000080 */
22 {0xe0, 0xa0, 0x80}, /* U-00000800 */
23 {0xf0, 0x90, 0x80, 0x80}, /* U-00010000 */
24 {0xf8, 0x88, 0x80, 0x80, 0x80}, /* U-00200000 */
25 {0xfc, 0x84, 0x80, 0x80, 0x80, 0x80} /* U-04000000 */
/* Six rows of 1, 2, 3, 4, 5 and 6 bytes, each annotated with a code point.
   NOTE(review): the declaration line for these rows is not visible here;
   they are presumably the `last` table read by test 2.2 -- confirm. */
29 {0x7f}, /* U-0000007F */
30 {0xdf, 0xbf}, /* U-000007FF */
31 {0xef, 0xbf, 0xbf}, /* U-0000FFFF */
32 {0xf7, 0xbf, 0xbf, 0xbf}, /* U-001FFFFF */
33 {0xfb, 0xbf, 0xbf, 0xbf, 0xbf}, /* U-03FFFFFF */
34 {0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf} /* U-7FFFFFFF */
/* Input for test 2.3.  The first three rows are three bytes long and the
   last two are four bytes long, which are the lengths test 2.3 requires
   mbtowc to return for the respective rows. */
37 char boundary[5][6] = {
38 {0xed, 0x9f, 0xbf}, /* U-0000D7FF */
39 {0xee, 0x80, 0x80}, /* U-0000E000 */
40 {0xef, 0xbf, 0xbd}, /* U-0000FFFD */
41 {0xf4, 0x8f, 0xbf, 0xbf}, /* U-0010FFFF */
42 {0xf4, 0x90, 0x80, 0x80} /* U-00110000 */
/* Input for test 3.1: eight rows of up to seven bytes.  The visible rows
   alternate 0x80 and 0xbf and are 4, 5, 6 and 7 bytes long.
   NOTE(review): the first four rows of the initializer are not visible in
   this view. */
45 char continuation_bytes[8][7] = {
50 {0x80, 0xbf, 0x80, 0xbf},
51 {0x80, 0xbf, 0x80, 0xbf, 0x80},
52 {0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf},
53 {0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 0x80}
/* Scratch buffers that the tests fill at run time before handing them to
   num_invalid. */
/* Test 3.1.9 stores every byte value 0x80..0xbf here (64 values). */
56 char all_continuation_bytes[64];
/* Test 3.2.1 stores every byte value 0xc0..0xdf here (32 values). */
59 char all_two_byte_seq[32];
/* Test 3.2.2 stores every byte value 0xe0..0xef here (16 values). */
60 char all_three_byte_seq[16];
/* Test 3.2.3 stores every byte value 0xf0..0xf7 here (8 values). */
61 char all_four_byte_seq[8];
/* Test 3.2.4 stores every byte value 0xf8..0xfb here (4 values). */
62 char all_five_byte_seq[4];
/* Test 3.2.5 stores the byte values 0xfc and 0xfd here (2 values). */
63 char all_six_byte_seq[2];
/* Input for test 3.3 ("sequences with last continuation byte missing").
   Rows 1-5 are 1, 2, 3, 4 and 5 bytes long, and rows 6-10 are again
   1, 2, 3, 4 and 5 bytes long; test 3.3 passes exactly those lengths to
   mbtowc.  Each row is annotated with a code point. */
65 char incomplete_seq[10][6] = {
66 {0xc2}, /* U-00000080 */
67 {0xe0, 0x80}, /* U-00000800 */
68 {0xf0, 0x80, 0x80}, /* U-00010000 */
69 {0xf8, 0x80, 0x80, 0x80}, /* U-00200000 */
70 {0xfc, 0x80, 0x80, 0x80, 0x80}, /* U-04000000 */
71 {0xdf}, /* U-000007FF */
72 {0xef, 0xbf}, /* U-0000FFFF */
73 {0xf7, 0xbf, 0xbf}, /* U-001FFFFF */
74 {0xfb, 0xbf, 0xbf, 0xbf}, /* U-03FFFFFF */
75 {0xfd, 0xbf, 0xbf, 0xbf, 0xbf} /* U-7FFFFFFF */
/* 30-byte buffer; 30 is the combined length of the ten incomplete_seq rows
   (2 * (1 + 2 + 3 + 4 + 5)).  NOTE(review): no read or write of this buffer
   is visible in this view, and test 3.4 (concatenation of incomplete
   sequences) is documented as excluded -- it may be unused. */
78 char incomplete_seq_concat[30];
/* Input for test 3.5: three rows of up to four bytes.  Test 3.5 reads one
   byte from each of the first two rows and four bytes from the last row.
   NOTE(review): the first two rows of the initializer are not visible in
   this view. */
80 char impossible_bytes[3][4] = {
83 {0xfe, 0xfe, 0xff, 0xff}
/* Input for test 4.1 ("examples of an overlong ASCII character").  Test 4.1
   reads row k (0-based) with a byte limit of k + 2.  The visible rows are
   4, 5 and 6 bytes long and all end in 0xaf.
   NOTE(review): the first two rows of the initializer are not visible in
   this view. */
86 char overlong[5][6] = {
89 {0xf0, 0x80, 0x80, 0xaf},
90 {0xf8, 0x80, 0x80, 0x80, 0xaf},
91 {0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf}
/* Input for test 4.2 ("maximum overlong sequences").  Test 4.2 reads row k
   (0-based) with a byte limit of k + 2.  The visible rows are 4, 5 and 6
   bytes long.
   NOTE(review): the first two rows of the initializer are not visible in
   this view. */
94 char overlong_max[5][6] = {
97 {0xf0, 0x8f, 0xbf, 0xbf},
98 {0xf8, 0x87, 0xbf, 0xbf, 0xbf},
99 {0xfc, 0x83, 0xbf, 0xbf, 0xbf, 0xbf}
/* Input for test 4.3 ("overlong representation of the NUL character").
   Test 4.3 reads row k (0-based) with a byte limit of k + 2.  The visible
   rows are 4, 5 and 6 bytes long: a lead byte followed only by 0x80 bytes.
   NOTE(review): the first two rows of the initializer are not visible in
   this view. */
102 char overlong_nul[5][6] = {
105 {0xf0, 0x80, 0x80, 0x80},
106 {0xf8, 0x80, 0x80, 0x80, 0x80},
107 {0xfc, 0x80, 0x80, 0x80, 0x80, 0x80}
/* Input for test 5.1 ("single UTF-16 surrogates"): seven rows of three
   bytes, each read by test 5.1 with a byte limit of 3.
   NOTE(review): the rows of the initializer are not visible in this view. */
110 char single_surrogates[7][3] = {
/* Input for test 5.2 ("paired UTF-16 surrogates"): eight rows of six bytes.
   Every row consists of two three-byte groups that each start with 0xed. */
120 char paired_surrogates[8][6] = {
121 {0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},
122 {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},
123 {0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},
124 {0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},
125 {0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},
126 {0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},
127 {0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},
128 {0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf}
/* Input for test 5.3 ("other illegal code positions"): two rows of three
   bytes, each read by test 5.3 with a byte limit of 3.
   NOTE(review): the rows of the initializer are not visible in this view. */
131 char illegal_pos[2][3] = {
/* Select the "C-UTF-8" locale for the LC_CTYPE category and print whether
   setlocale accepted it.  NOTE(review): the statements between these lines
   are not visible in this view, so what happens after a failed setlocale
   (beyond the message) cannot be confirmed here. */
142 if (!setlocale(LC_CTYPE, "C-UTF-8"))
144 printf("Failed to set C-UTF-8 locale.\n");
148 printf("Set C-UTF-8 locale.\n");
150 /* 2 Boundary condition test cases */
151 /* 2.1 First possible sequence of a certain length */
/* Row 0 of `first` (the single byte 0x0) is converted on its own; rows 1..5
   are converted by the loop below, which labels them 2.1.2 .. 2.1.6.  Every
   call allows mbtowc to read up to MAX_BYTES bytes.
   NOTE(review): this first message formats wchar with %08d while every
   other code point message in this file uses %08x -- looks inconsistent.
   The declaration of wchar is not visible here; confirm that the
   conversion specifiers match its type. */
152 retval = mbtowc(&wchar, first[0], MAX_BYTES);
154 printf("2.1.1: U-%08d\n", wchar);
156 printf("2.1.1: Invalid\n");
158 for (i = 2; i < 7; i++)
160 retval = mbtowc (&wchar, first[i-1], MAX_BYTES);
162 printf("2.1.%d: U-%08x\n", i, wchar);
164 printf("2.1.%d: Invalid\n", i);
167 /* 2.2 Last possible sequence of a certain length */
/* Converts rows 0..5 of `last` (i runs 1..6, indexing i-1), allowing mbtowc
   up to MAX_BYTES bytes, and prints either the resulting code point in hex
   or "Invalid" under the label 2.2.i. */
168 for (i = 1; i < 7; i++)
170 retval = mbtowc (&wchar, last[i-1], MAX_BYTES);
172 printf("2.2.%d: U-%08x\n", i, wchar);
174 printf("2.2.%d: Invalid\n", i);
177 /* 2.3 Other boundary conditions */
/* Converts rows 0..4 of `boundary` (i runs 1..5, indexing i-1).  The
   condition below accepts a return of 3 for i = 1..3 and a return of 4 for
   i = 4..5, matching the byte lengths of the corresponding rows. */
178 for (i = 1; i < 6; i++)
180 retval = mbtowc (&wchar, boundary[i-1], MAX_BYTES);
181 if ((i < 4 && retval == 3) || (i > 3 && retval == 4))
182 printf("2.3.%d: U-%08x\n", i, wchar);
184 printf("2.3.%d: Invalid\n", i);
187 /* 3 Malformed sequences */
188 /* 3.1 Unexpected continuation bytes */
/* Rows 0 and 1 of continuation_bytes are each converted with a single
   mbtowc call (tests 3.1.1 and 3.1.2). */
189 retval = mbtowc (&wchar, continuation_bytes[0], MAX_BYTES);
191 printf("3.1.1: U-%08x\n", wchar);
193 printf("3.1.1: 1 Invalid\n");
195 retval = mbtowc (&wchar, continuation_bytes[1], MAX_BYTES);
197 printf("3.1.2: U-%08x\n", wchar);
199 printf("3.1.2: 1 Invalid\n");
/* Row i is handed to num_invalid with a length of i bytes and reported
   under the label 3.1.(i+1).  NOTE(review): the loop header that drives i
   is not visible in this view. */
203 retval = num_invalid(continuation_bytes[i], i);
205 printf("3.1.%d: Valid Character Found\n", i+1);
207 printf("3.1.%d: %d Invalid\n", i+1, retval);
/* Test 3.1.9: fill the buffer with every byte value 0x80..0xbf in order,
   then pass all 0xc0 - 0x80 = 64 bytes to num_invalid. */
210 for(i = 0x80; i < 0xc0; i++)
211 all_continuation_bytes[i-0x80] = i;
213 retval = num_invalid(all_continuation_bytes, 0xc0 - 0x80);
215 printf("3.1.9: Valid Character Found\n");
217 printf("3.1.9: %d Invalid\n", retval);
219 /* 3.2 Lonely start characters */
/* Each of the five tests below fills a buffer with one contiguous range of
   byte values, passes the whole buffer to num_invalid, and prints either
   "Valid Character Found" or the count that num_invalid returned. */
/* 3.2.1: byte values 0xc0..0xdf (32 bytes). */
220 for(i = 0xc0; i < 0xe0; i++)
221 all_two_byte_seq[i-0xc0] = i;
223 retval = num_invalid(all_two_byte_seq, 0xe0 - 0xc0);
225 printf("3.2.1: Valid Character Found\n");
227 printf("3.2.1: %d Invalid\n", retval);
/* 3.2.2: byte values 0xe0..0xef (16 bytes). */
229 for(i = 0xe0; i < 0xf0; i++)
230 all_three_byte_seq[i-0xe0] = i;
232 retval = num_invalid(all_three_byte_seq, 0xf0 - 0xe0);
234 printf("3.2.2: Valid Character Found\n");
236 printf("3.2.2: %d Invalid\n", retval);
/* 3.2.3: byte values 0xf0..0xf7 (8 bytes). */
238 for(i = 0xf0; i < 0xf8; i++)
239 all_four_byte_seq[i-0xf0] = i;
241 retval = num_invalid(all_four_byte_seq, 0xf8 - 0xf0);
243 printf("3.2.3: Valid Character Found\n");
245 printf("3.2.3: %d Invalid\n", retval);
/* 3.2.4: byte values 0xf8..0xfb (4 bytes). */
247 for(i = 0xf8; i < 0xfc; i++)
248 all_five_byte_seq[i-0xf8] = i;
250 retval = num_invalid(all_five_byte_seq, 0xfc - 0xf8);
252 printf("3.2.4: Valid Character Found\n");
254 printf("3.2.4: %d Invalid\n", retval);
/* 3.2.5: byte values 0xfc and 0xfd (2 bytes). */
256 for(i = 0xfc; i < 0xfe; i++)
257 all_six_byte_seq[i-0xfc] = i;
259 retval = num_invalid(all_six_byte_seq, 0xfe - 0xfc);
261 printf("3.2.5: Valid Character Found\n");
263 printf("3.2.5: %d Invalid\n", retval);
265 /* 3.3 Sequences with last continuation byte missing */
/* First loop: rows 0..4 of incomplete_seq, with mbtowc limited to i bytes
   (1..5), which is the number of bytes present in each of those rows. */
266 for(i = 1; i < 6; i++)
268 retval = mbtowc(&wchar, incomplete_seq[i-1], i);
270 printf("3.3.%d: 1 Invalid\n", i);
272 printf("3.3.%d: Valid Character Found\n", i);
/* Second loop: rows 5..9, with mbtowc limited to i - 5 bytes (again 1..5),
   matching the number of bytes present in each of those rows. */
275 for(i = 6; i < 11; i++)
277 retval = mbtowc(&wchar, incomplete_seq[i-1], i - 5);
279 printf("3.3.%d: 1 Invalid\n", i);
281 printf("3.3.%d: Valid Character Found\n", i);
284 /* 3.4 Concatenation of incomplete sequences */
285 /* This test is excluded because the mbtowc function does not return the
286 number of bytes read in an invalid multi-byte sequence. */
288 /* 3.5 Impossible bytes */
/* Rows 0 and 1 of impossible_bytes are converted with a byte limit of 1;
   row 2 is converted with a byte limit of 4, the full width of a row. */
289 retval = mbtowc(&wchar, impossible_bytes[0], 1);
291 printf("3.5.1: 1 Invalid\n");
293 printf("3.5.1: Valid Character Found\n");
295 retval = mbtowc(&wchar, impossible_bytes[1], 1);
297 printf("3.5.2: 1 Invalid\n");
299 printf("3.5.2: Valid Character Found\n");
301 retval = mbtowc(&wchar, impossible_bytes[2], 4);
303 printf("3.5.3: 1 Invalid\n");
305 printf("3.5.3: Valid Character Found\n");
307 /* 4 Overlong sequences */
/* All three loops in this section share one shape: i runs 2..6, row i-2 of
   the table is converted with a byte limit of i, and the result is printed
   under the sub-test number i-1 (so 1..5). */
308 /* 4.1 Examples of an overlong ASCII character */
309 for(i = 2; i < 7; i++)
311 retval = mbtowc(&wchar, overlong[i-2], i);
313 printf("4.1.%d: 1 Invalid\n", i-1);
315 printf("4.1.%d: Valid Character Found\n", i-1);
318 /* 4.2 Maximum overlong sequences */
319 for(i = 2; i < 7; i++)
321 retval = mbtowc(&wchar, overlong_max[i-2], i);
323 printf("4.2.%d: 1 Invalid\n", i-1);
325 printf("4.2.%d: Valid Character Found\n", i-1);
328 /* 4.3 Overlong representation of the NUL character */
329 for(i = 2; i < 7; i++)
331 retval = mbtowc(&wchar, overlong_nul[i-2], i);
333 printf("4.3.%d: 1 Invalid\n", i-1);
335 printf("4.3.%d: Valid Character Found\n", i-1);
338 /* 5 Illegal code positions */
339 /* 5.1 Single UTF-16 surrogates */
/* Converts rows 0..6 of single_surrogates (i runs 1..7, indexing i-1) with
   a byte limit of 3, i.e. every row of the seven-row table. */
340 for (i = 1; i < 8; i++)
342 retval = mbtowc(&wchar, single_surrogates[i-1], 3);
344 printf("5.1.%d: 1 Invalid\n", i);
346 printf("5.1.%d: Valid Character Found\n", i);
349 /* 5.2 Paired UTF-16 surrogates */
/* Converts rows of paired_surrogates with a byte limit of 6 (a full row).
   NOTE(review): paired_surrogates is declared with 8 rows, but this loop
   runs i = 1..7 and indexes i-1, so only rows 0..6 are converted and the
   eighth row is never passed to mbtowc.  This looks like an off-by-one
   (a bound of i < 9 would cover every row); confirm against the expected
   test output before changing it, since a 5.2.8 line would be added. */
350 for (i = 1; i < 8; i++)
352 retval = mbtowc(&wchar, paired_surrogates[i-1], 6);
354 printf("5.2.%d: 1 Invalid\n", i);
356 printf("5.2.%d: Valid Character Found\n", i);
359 /* 5.3 Other illegal code positions */
/* Converts both rows of illegal_pos, each with a byte limit of 3 (a full
   row), and reports them as 5.3.1 and 5.3.2. */
360 retval = mbtowc(&wchar, illegal_pos[0], 3);
362 printf("5.3.1: 1 Invalid\n");
364 printf("5.3.1: Valid Character Found\n");
366 retval = mbtowc(&wchar, illegal_pos[1], 3);
368 printf("5.3.2: 1 Invalid\n");
370 printf("5.3.2: Valid Character Found\n");
375 /* return number of invalid characters in string,
376 returns -1 if a valid character is found */
378 num_invalid(const char *s, int len)
388 for(i=0; i<len; t++, i++)
390 retval = mbtowc (&wchar, t, len - i);