2 * Copyright (C) 2000-2006 Erik Andersen <andersen@uclibc.org>
4 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
23 #include UCLIBC_CTYPE_HEADER
25 /* 0x9 : space blank */
30 /* 0x20 : space blank */
31 /* 0x1680 : space blank */
32 /* 0x2000 : space blank */
33 /* 0x2001 : space blank */
34 /* 0x2002 : space blank */
35 /* 0x2003 : space blank */
36 /* 0x2004 : space blank */
37 /* 0x2005 : space blank */
38 /* 0x2006 : space blank */
39 /* 0x2008 : space blank */
40 /* 0x2009 : space blank */
41 /* 0x200a : space blank */
42 /* 0x200b : space blank */
45 /* 0x3000 : space blank */
47 /* typecount[ 0] = 88670 C_alpha_nonupper_nonlower */
48 /* typecount[ 1] = 742 C_alpha_lower */
49 /* typecount[ 2] = 4 C_alpha_upper_lower */
50 /* typecount[ 3] = 731 C_alpha_upper */
51 /* typecount[ 4] = 10 C_digit */
52 /* typecount[ 5] = 10270 C_punct */
53 /* typecount[ 6] = 0 C_graph */
54 /* typecount[ 7] = 0 C_print_space_nonblank */
55 /* typecount[ 8] = 14 C_print_space_blank */
56 /* typecount[ 9] = 0 C_space_nonblank_noncntrl */
57 /* typecount[10] = 0 C_space_blank_noncntrl */
58 /* typecount[11] = 6 C_cntrl_space_nonblank */
59 /* typecount[12] = 1 C_cntrl_space_blank */
60 /* typecount[13] = 60 C_cntrl_nonspace */
61 /* typecount[14] = 96100 C_unclassified */
62 /* typecount[15] = 0 empty_slot */
66 /* Set to #if 0 to restrict wchars to 16 bits. */
68 #define RANGE 0x2ffffUL
70 #define RANGE 0x1ffffUL
72 #define RANGE 0xffffUL /* Restrict for 16-bit wchar_t... */
76 /* Classification codes. */
78 static const char *typename[] = {
80 "C_alpha_nonupper_nonlower",
82 "C_alpha_upper_lower",
87 "C_print_space_nonblank",
88 "C_print_space_blank",
89 "C_space_nonblank_noncntrl",
90 "C_space_blank_noncntrl",
91 "C_cntrl_space_nonblank",
92 "C_cntrl_space_blank",
99 /* Taking advantage of the C99 mutual-exclusion guarantees for the various
100 * (w)ctype classes, including the descriptions of printing and control
101 * (w)chars, we can place each in one of the following mutually-exclusive
102 * subsets. Since there are fewer than 16, we can store the data for
103 * each (w)char in a nibble. In contrast, glibc uses an unsigned int
104 * per (w)char, with one bit flag for each is* type. While this allows
105 * a simple '&' operation to determine the type vs. a range test and a
106 * little special handling for the "blank" and "xdigit" types in my
107 * approach, it also uses 8 times the space for the tables on the typical
108 * 32-bit archs we support. */
110 __CTYPE_unclassified = 0,
111 __CTYPE_alpha_nonupper_nonlower,
113 __CTYPE_alpha_upper_lower,
118 __CTYPE_print_space_nonblank,
119 __CTYPE_print_space_blank,
120 __CTYPE_space_nonblank_noncntrl,
121 __CTYPE_space_blank_noncntrl,
122 __CTYPE_cntrl_space_nonblank,
123 __CTYPE_cntrl_space_blank,
124 __CTYPE_cntrl_nonspace,
128 #define __CTYPE_isxdigit(D,X) \
129 (__CTYPE_isdigit(D) || (((unsigned int)(((X)|0x20) - 'a')) <= 5))
131 #define mywalnum(x) __CTYPE_isalnum(d)
132 #define mywalpha(x) __CTYPE_isalpha(d)
133 #define mywblank(x) __CTYPE_isblank(d)
134 #define mywcntrl(x) __CTYPE_iscntrl(d)
135 #define mywdigit(x) __CTYPE_isdigit(d)
136 #define mywgraph(x) __CTYPE_isgraph(d)
137 #define mywlower(x) __CTYPE_islower(d)
138 #define mywprint(x) __CTYPE_isprint(d)
139 #define mywpunct(x) __CTYPE_ispunct(d)
140 #define mywspace(x) __CTYPE_isspace(d)
141 #define mywupper(x) __CTYPE_isupper(d)
142 #define mywxdigit(x) __CTYPE_isxdigit(d,x)
154 unsigned char ii_shift;
155 unsigned char ti_shift;
163 void output_table(FILE *fp, const char *name, table_data *tbl)
167 fprintf(fp, "#define __LOCALE_DATA_WC%s_II_LEN %7u\n", name, tbl->ii_len);
168 fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_LEN %7u\n", name, tbl->ti_len);
169 fprintf(fp, "#define __LOCALE_DATA_WC%s_UT_LEN %7u\n", name, tbl->ut_len);
171 fprintf(fp, "#define __LOCALE_DATA_WC%s_II_SHIFT %7u\n", name, tbl->ii_shift);
172 fprintf(fp, "#define __LOCALE_DATA_WC%s_TI_SHIFT %7u\n", name, tbl->ti_shift);
174 fprintf(fp, "\n#ifdef WANT_WC%s_data\n", name);
176 i = tbl->ii_len + tbl->ti_len + tbl->ut_len;
177 fprintf(fp, "\nstatic const unsigned char __LOCALE_DATA_WC%s_data[%zu] = {", name, i);
178 for (i=0 ; i < tbl->ii_len ; i++) {
182 fprintf(fp, " %#04x,", tbl->ii[i]);
184 for (i=0 ; i < tbl->ti_len ; i++) {
188 fprintf(fp, " %#04x,", tbl->ti[i]);
190 for (i=0 ; i < tbl->ut_len ; i++) {
194 fprintf(fp, " %#04x,", tbl->ut[i]);
196 fprintf(fp, "\n};\n\n");
198 fprintf(fp, "#endif /* WANT_WC%s_data */\n\n", name);
201 static void dump_table_data(table_data *tbl)
203 printf("ii_shift = %d ti_shift = %d\n"
204 "ii_len = %d ti_len = %d ut_len = %d\n"
206 tbl->ii_shift, tbl->ti_shift,
207 tbl->ii_len, tbl->ti_len, tbl->ut_len,
208 (int) tbl->ii_len + (int) tbl->ti_len + (int) tbl->ut_len);
211 /* For sorting the blocks of unsigned chars. */
212 static size_t nu_val;
214 int nu_memcmp(const void *a, const void *b)
216 return memcmp(*(unsigned char**)a, *(unsigned char**)b, nu_val);
219 static size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl);
221 #define MAXTO 255 /* Restrict to minimal unsigned char max. */
223 int main(int argc, char **argv)
231 uldiff_entry uldiff[MAXTO];
235 table_data combtable;
236 table_data widthtable;
237 long int last_comb = 0;
239 unsigned char wct[(RANGE/2)+1]; /* wctype table (nibble per wchar) */
240 unsigned char ult[RANGE+1]; /* upper/lower table */
241 unsigned char combt[(RANGE/4)+1]; /* combining */
242 unsigned char widtht[(RANGE/4)+1]; /* width */
244 wctype_t is_comb, is_comb3;
246 long int typecount[16];
247 const char *typename[16];
248 static const char empty_slot[] = "empty_slot";
251 #define INIT_TYPENAME(X) typename[__CTYPE_##X] = "C_" #X
253 for (i=0 ; i < 16 ; i++) {
254 typename[i] = empty_slot;
257 INIT_TYPENAME(unclassified);
258 INIT_TYPENAME(alpha_nonupper_nonlower);
259 INIT_TYPENAME(alpha_lower);
260 INIT_TYPENAME(alpha_upper_lower);
261 INIT_TYPENAME(alpha_upper);
262 INIT_TYPENAME(digit);
263 INIT_TYPENAME(punct);
264 INIT_TYPENAME(graph);
265 INIT_TYPENAME(print_space_nonblank);
266 INIT_TYPENAME(print_space_blank);
267 INIT_TYPENAME(space_nonblank_noncntrl);
268 INIT_TYPENAME(space_blank_noncntrl);
269 INIT_TYPENAME(cntrl_space_nonblank);
270 INIT_TYPENAME(cntrl_space_blank);
271 INIT_TYPENAME(cntrl_nonspace);
273 memset(&cttable, 0, sizeof(table_data));
274 memset(&ultable, 0, sizeof(table_data));
276 memset(combtable, 0, sizeof table_data);
277 memset(widthtable, 0, sizeof table_data);
279 setvbuf(stdout, NULL, _IONBF, 0);
282 if (!setlocale(LC_CTYPE, *++argv)) {
283 printf("setlocale(LC_CTYPE,%s) failed! Skipping this locale...\n", *argv);
287 if (!(totitle = wctrans("totitle"))) {
288 printf("no totitle transformation.\n");
290 if (!(is_comb = wctype("combining"))) {
291 printf("no combining wctype.\n");
293 if (!(is_comb3 = wctype("combining_level3"))) {
294 printf("no combining_level3 wctype.\n");
300 uldiff[0].u = uldiff[0].l = 0;
302 memset(wct, 0, sizeof(wct));
303 memset(combt, 0, sizeof(combt));
304 memset(widtht, 0, sizeof(widtht));
306 for (i = 0 ; i < 16 ; i++) {
310 for (c=0 ; c <= RANGE ; c++) {
313 } else if (iswalpha(c)) {
314 d = __CTYPE_alpha_nonupper_nonlower;
316 d = __CTYPE_alpha_lower;
318 d = __CTYPE_alpha_upper_lower;
320 } else if (iswupper(c)) {
321 d = __CTYPE_alpha_upper;
323 } else if (iswpunct(c)) {
325 } else if (iswgraph(c)) {
327 } else if (iswprint(c)) {
328 d = __CTYPE_print_space_nonblank;
330 d = __CTYPE_print_space_blank;
332 } else if (iswspace(c) && !iswcntrl(c)) {
333 d = __CTYPE_space_nonblank_noncntrl;
335 d = __CTYPE_space_blank_noncntrl;
337 } else if (iswcntrl(c)) {
338 d = __CTYPE_cntrl_nonspace;
340 d = __CTYPE_cntrl_space_nonblank;
342 d = __CTYPE_cntrl_space_blank;
346 d = __CTYPE_unclassified;
354 printf("%#8x : space blank\n", c);
356 printf("%#8x : space\n", c);
366 if (isalnum(c)) ++glibc; glibc <<= 1;
367 if (isalpha(c)) ++glibc; glibc <<= 1;
368 if (isblank(c)) ++glibc; glibc <<= 1;
369 if (iscntrl(c)) ++glibc; glibc <<= 1;
370 if (isdigit(c)) ++glibc; glibc <<= 1;
371 if (isgraph(c)) ++glibc; glibc <<= 1;
372 if (islower(c)) ++glibc; glibc <<= 1;
373 if (isprint(c)) ++glibc; glibc <<= 1;
374 if (ispunct(c)) ++glibc; glibc <<= 1;
375 if (isspace(c)) ++glibc; glibc <<= 1;
376 if (isupper(c)) ++glibc; glibc <<= 1;
377 if (isxdigit(c)) ++glibc;
378 printf("%#8x : ctype %#4x\n", c, glibc);
383 /* Paranoid checking... */
389 if (iswalnum(c)) ++glibc; glibc <<= 1;
390 if (iswalpha(c)) ++glibc; glibc <<= 1;
391 if (iswblank(c)) ++glibc; glibc <<= 1;
392 if (iswcntrl(c)) ++glibc; glibc <<= 1;
393 if (iswdigit(c)) ++glibc; glibc <<= 1;
394 if (iswgraph(c)) ++glibc; glibc <<= 1;
395 if (iswlower(c)) ++glibc; glibc <<= 1;
396 if (iswprint(c)) ++glibc; glibc <<= 1;
397 if (iswpunct(c)) ++glibc; glibc <<= 1;
398 if (iswspace(c)) ++glibc; glibc <<= 1;
399 if (iswupper(c)) ++glibc; glibc <<= 1;
400 if (iswxdigit(c)) ++glibc;
403 if (mywalnum(c)) ++mine; mine <<= 1;
404 if (mywalpha(c)) ++mine; mine <<= 1;
405 if (mywblank(c)) ++mine; mine <<= 1;
406 if (mywcntrl(c)) ++mine; mine <<= 1;
407 if (mywdigit(c)) ++mine; mine <<= 1;
408 if (mywgraph(c)) ++mine; mine <<= 1;
409 if (mywlower(c)) ++mine; mine <<= 1;
410 if (mywprint(c)) ++mine; mine <<= 1;
411 if (mywpunct(c)) ++mine; mine <<= 1;
412 if (mywspace(c)) ++mine; mine <<= 1;
413 if (mywupper(c)) ++mine; mine <<= 1;
414 if (mywxdigit(c)) ++mine;
417 printf("%#8x : glibc %#4x != %#4x mine %u\n", c, glibc, mine, d);
422 if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
423 /* if (!iswpunct(c)) { */
424 printf("%#8x : %d %d %#4x\n",
425 c, iswctype(c,is_comb),iswctype(c,is_comb3), glibc);
430 if (iswctype(c,is_comb) || iswctype(c,is_comb3)) {
432 printf("%#8x - ", c);
434 } else if (last_comb + 1 < c) {
435 printf("%#8x\n%#8x - ", last_comb, c);
445 combt[c/4] |= ((((!!iswctype(c,is_comb)) << 1) | !!iswctype(c,is_comb3))
447 /* comb3t[c/8] |= ((!!iswctype(c,is_comb3)) << (c & 7)); */
449 /* widtht[c/4] |= (wcwidth(c) << ((c & 3) << 1)); */
451 if (c & 1) { /* Use the high nibble for odd numbered wchars. */
456 l = (long)(int) towlower(c) - c;
457 u = (long)(int) towupper(c) - c;
460 if ((l != (short)l) || (u != (short)u)) {
461 printf("range assumption error! %x %ld %ld\n", c, l, u);
464 for (i=0 ; i < ul_count ; i++) {
465 if ((l == uldiff[i].l) && (u == uldiff[i].u)) {
469 uldiff[ul_count].l = l;
470 uldiff[ul_count].u = u;
472 if (ul_count > MAXTO) {
473 printf("too many touppers/tolowers!\n");
481 for (i = 0 ; i < 16 ; i++) {
482 printf("typecount[%2d] = %8ld %s\n", i, typecount[i], typename[i]);
485 printf("optimizing is* table..\n");
489 for (i=0 ; i < 14 ; i++) {
490 t = newopt(wct, (RANGE/2)+1, i, &cttable);
498 printf("smallest = %zu\n", smallest);
499 if (!(cttable.ii = malloc(smallest))) {
500 printf("couldn't allocate space!\n");
504 newopt(wct, (RANGE/2)+1, n, &cttable);
505 ++cttable.ti_shift; /* correct for nibble mode */
509 printf("optimizing u/l-to table..\n");
512 for (i=0 ; i < 14 ; i++) {
513 t = newopt(ult, RANGE+1, i, &ultable);
521 printf("%zu (smallest) + %zu (u/l diffs) = %zu\n",
522 smallest, 4 * ul_count, smallest + 4 * ul_count);
523 printf("smallest = %zu\n", smallest);
524 if (!(ultable.ii = malloc(smallest))) {
525 printf("couldn't allocate space!\n");
529 newopt(ult, RANGE+1, n, &ultable);
533 printf("optimizing comb table..\n");
536 for (i=0 ; i < 14 ; i++) {
537 t = newopt(combt, sizeof(combt), i, &combtable);
545 printf("smallest = %zu\n", smallest);
546 if (!(combtable.ii = malloc(smallest))) {
547 printf("couldn't allocate space!\n");
551 newopt(combt, sizeof(combt), n, &combtable);
552 combtable.ti_shift += 4; /* correct for 4 entries per */
557 printf("optimizing width table..\n");
559 widthtable.ii = NULL;
560 for (i=0 ; i < 14 ; i++) {
561 t = newopt(widtht, sizeof(widtht), i, &widthtable);
569 printf("smallest = %zu\n", smallest);
570 if (!(widthtable.ii = malloc(smallest))) {
571 printf("couldn't allocate space!\n");
575 newopt(widtht, sizeof(widtht), n, &widthtable);
576 widthtable.ti_shift += 4; /* correct for 4 entries per */
580 printf("optimizing comb3 table..\n");
582 comb3table.ii = NULL;
583 for (i=0 ; i < 14 ; i++) {
584 t = newopt(comb3t, sizeof(comb3t), i, &comb3table);
592 printf("smallest = %zu\n", smallest);
593 if (!(comb3table.ii = malloc(smallest))) {
594 printf("couldn't allocate space!\n");
598 newopt(comb3t, sizeof(comb3t), n, &comb3table);
599 comb3table.ti_shift += 8; /* correct for 4 entries per */
602 dump_table_data(&cttable);
603 dump_table_data(&ultable);
605 dump_table_data(&combtable);
609 printf("verifying for %s...\n", *argv);
611 for (c=0 ; c <= 0xffffUL ; c++)
613 for (c=0 ; c <= 0x10ffffUL ; c++)
618 unsigned int upper, lower;
621 #if RANGE < 0x10000UL
622 if (c == 0x10000UL) {
623 c = 0x30000UL; /* skip 1st and 2nd sup planes */
625 #elif RANGE < 0x20000UL
626 if (c == 0x20000UL) {
627 c = 0x30000UL; /* skip 2nd sup planes */
633 if (iswalnum(c)) ++glibc; glibc <<= 1;
634 if (iswalpha(c)) ++glibc; glibc <<= 1;
635 if (iswblank(c)) ++glibc; glibc <<= 1;
636 if (iswcntrl(c)) ++glibc; glibc <<= 1;
637 if (iswdigit(c)) ++glibc; glibc <<= 1;
638 if (iswgraph(c)) ++glibc; glibc <<= 1;
639 if (iswlower(c)) ++glibc; glibc <<= 1;
640 if (iswprint(c)) ++glibc; glibc <<= 1;
641 if (iswpunct(c)) ++glibc; glibc <<= 1;
642 if (iswspace(c)) ++glibc; glibc <<= 1;
643 if (iswupper(c)) ++glibc; glibc <<= 1;
644 if (iswxdigit(c)) ++glibc;
653 sc = u & ((1 << cttable.ti_shift) - 1);
654 u >>= cttable.ti_shift;
655 n = u & ((1 << cttable.ii_shift) - 1);
656 u >>= cttable.ii_shift;
659 i0 <<= cttable.ii_shift;
660 i1 = cttable.ti[i0 + n];
661 i1 <<= (cttable.ti_shift-1);
662 d = cttable.ut[i1 + (sc >> 1)];
668 } else if ((((unsigned int)(c - 0xe0020UL)) <= 0x5f) || (c == 0xe0001UL)){
670 } else if (((unsigned int)(c - 0xf0000UL)) < 0x20000UL) {
671 if ((c & 0xffffU) <= 0xfffdU) {
674 d = __CTYPE_unclassified;
677 d = __CTYPE_unclassified;
681 if (mywalnum(c)) ++mine; mine <<= 1;
682 if (mywalpha(c)) ++mine; mine <<= 1;
683 if (mywblank(c)) ++mine; mine <<= 1;
684 if (mywcntrl(c)) ++mine; mine <<= 1;
685 if (mywdigit(c)) ++mine; mine <<= 1;
686 if (mywgraph(c)) ++mine; mine <<= 1;
687 if (mywlower(c)) ++mine; mine <<= 1;
688 if (mywprint(c)) ++mine; mine <<= 1;
689 if (mywpunct(c)) ++mine; mine <<= 1;
690 if (mywspace(c)) ++mine; mine <<= 1;
691 if (mywupper(c)) ++mine; mine <<= 1;
692 if (mywxdigit(c)) ++mine;
695 printf("%#8x : glibc %#4x != %#4x mine %d\n", c, glibc, mine, d);
697 printf("sc=%#x u=%#x n=%#x i0=%#x i1=%#x\n", sc, u, n, i0, i1);
700 upper = lower = u = c;
702 sc = u & ((1 << ultable.ti_shift) - 1);
703 u >>= ultable.ti_shift;
704 n = u & ((1 << ultable.ii_shift) - 1);
705 u >>= ultable.ii_shift;
708 i0 <<= ultable.ii_shift;
709 i1 = ultable.ti[i0 + n];
710 i1 <<= (ultable.ti_shift);
713 upper = c + uldiff[i0].u;
714 lower = c + uldiff[i0].l;
717 if (towupper(c) != upper) {
718 printf("%#8x : towupper glibc %#4x != %#4x mine\n",
719 c, towupper(c), upper);
722 if (towlower(c) != lower) {
723 printf("%#8x : towlower glibc %#4x != %#4x mine i0 = %d\n",
724 c, towlower(c), lower, i0);
727 if (totitle && ((tt = towctrans(c, totitle)) != upper)) {
728 printf("%#8x : totitle glibc %#4lx != %#4x mine i0 = %d\n",
734 if ((c & 0xfff) == 0xfff) printf(".");
742 if (!(fp = fopen("wctables.h", "w"))) {
743 printf("cannot open output file 'wctables.h'!\n");
747 fprintf(fp, "#define __LOCALE_DATA_WC_TABLE_DOMAIN_MAX %#8lx\n\n",
748 (unsigned long) RANGE);
749 output_table(fp, "ctype", &cttable);
750 output_table(fp, "uplow", &ultable);
753 #warning fix the upper bound on the upper/lower tables... save 200 bytes or so
754 fprintf(fp, "#define __LOCALE_DATA_WCuplow_diffs %7u\n", ul_count);
755 fprintf(fp, "\n#ifdef WANT_WCuplow_diff_data\n\n");
756 fprintf(fp, "\nstatic const short __LOCALE_DATA_WCuplow_diff_data[%zu] = {",
757 2 * (size_t) ul_count);
758 for (i=0 ; i < ul_count ; i++) {
762 fprintf(fp, " %6d, %6d,", uldiff[i].u, uldiff[i].l);
764 fprintf(fp, "\n};\n\n");
765 fprintf(fp, "#endif /* WANT_WCuplow_diff_data */\n\n");
768 /* output_table(fp, "comb", &combtable); */
769 /* output_table(fp, "width", &widthtable); */
777 size_t newopt(unsigned char *ut, size_t usize, int shift, table_data *tbl)
780 unsigned char *ti[RANGE+1]; /* table index */
786 unsigned char *ii_save;
788 unsigned char uit[RANGE+1];
791 memset(uniqblock, 0x00, sizeof(uniqblock));
794 blocksize = 1 << shift;
795 numblocks = usize >> shift;
797 /* init table index */
798 for (i=j=0 ; i < numblocks ; i++) {
805 qsort(ti, numblocks, sizeof(unsigned char *), nu_memcmp);
808 uit[(ti[0]-ut)/blocksize] = 0;
809 for (i=1 ; i < numblocks ; i++) {
810 if (memcmp(ti[i-1], ti[i], blocksize) < 0) {
814 uniqblock[uniq - 1] = i;
817 else if (memcmp(ti[i-1], ti[i], blocksize) > 0) {
818 printf("bad sort %i!\n", i);
822 uit[(ti[i]-ut)/blocksize] = uniq - 1;
828 smallest = numblocks + uniq * blocksize;
831 for (j=1 ; j < 14 ; j++) {
832 if ((numblocks >> j) < 2) break;
837 if ((t = newopt(uit, numblocks, j, tbl)) < SIZE_MAX) {
838 t += uniq * blocksize;
847 printf("ishift %zu tshift %zu size %zu\n",
862 tbl->ii_shift = shift;
863 tbl->ii_len = numblocks;
864 memcpy(tbl->ii, uit, numblocks);
865 tbl->ti = tbl->ii + tbl->ii_len;
866 tbl->ti_len = uniq * blocksize;
867 for (i=0 ; i < uniq ; i++) {
868 memcpy(tbl->ti + i * blocksize, ti[uniqblock[i]], blocksize);
872 printf("setting ishift %zu tshift %zu\n",
874 newopt(uit, numblocks, shift2, tbl);
876 tbl->ti_shift = shift;
877 tbl->ut_len = uniq * blocksize;
878 tbl->ut = tbl->ti + tbl->ti_len;
879 for (i=0 ; i < uniq ; i++) {
880 memcpy(tbl->ut + i * blocksize, ti[uniqblock[i]], blocksize);
886 /* vi: set sw=4 ts=4: */