10 /* #define CTYPE_PACKED */
11 #define UPLOW_IDX_SHIFT 3 /* log2 of UPLOW_ROW_LEN (entries per uplow row) */
12 /* best if 2 unpacked or 3 packed (refers to CTYPE_IDX_SHIFT below) */
13 #define CTYPE_IDX_SHIFT 3 /* log2 of CTYPE_ROW_LEN (entries per ctype row) */
14 /* 3 or 4 are very similar (refers to C2WC_IDX_SHIFT below) */
15 #define C2WC_IDX_SHIFT 3 /* log2 of C2WC_ROW_LEN (entries per c2wc row) */
17 #define CTYPE_IDX_LEN (128 >> (CTYPE_IDX_SHIFT)) /* per-codeset index entries covering 128 chars */
18 #define UPLOW_IDX_LEN (128 >> (UPLOW_IDX_SHIFT)) /* per-codeset index entries covering 128 chars */
19 #define C2WC_IDX_LEN (128 >> (C2WC_IDX_SHIFT)) /* per-codeset index entries covering 128 chars */
21 /* #ifdef CTYPE_PACKED */
22 /* #define CTYPE_ROW_LEN (1 << ((CTYPE_IDX_SHIFT)-1)) */
24 #define CTYPE_ROW_LEN (1 << (CTYPE_IDX_SHIFT)) /* chars described by one ctype row */
26 #define UPLOW_ROW_LEN (1 << (UPLOW_IDX_SHIFT)) /* chars described by one uplow row */
27 #define C2WC_ROW_LEN (1 << (C2WC_IDX_SHIFT)) /* chars described by one c2wc row */
31 #define MAX_WCHAR (0x2600-1) /* largest wide char value accepted from the input files */
33 static unsigned char ctype_tbl[256 * CTYPE_ROW_LEN]; /* storage for up to 256 ctype rows */
34 static unsigned char uplow_tbl[256 * UPLOW_ROW_LEN]; /* storage for up to 256 uplow rows */
36 static unsigned short c2wc_tbl[256 * C2WC_ROW_LEN]; /* storage for up to 256 c2wc rows (16-bit entries) */
38 static unsigned char tt[MAX_WCHAR+1]; /* unique (1 << TT_SHIFT)-byte blocks taken from the w2c maps */
39 static unsigned char ti[MAX_WCHAR+1]; /* unique (1 << TI_SHIFT)-byte blocks; presumably blocks of xi -- TODO confirm */
40 static unsigned char xi[MAX_WCHAR+1]; /* for each w2c block (i >> TT_SHIFT), its block number in tt */
42 static int n_ctype_rows; /* number of ctype rows in use */
43 static int n_uplow_rows; /* number of uplow rows in use */
45 static int n_c2wc_rows; /* number of c2wc rows in use */
50 #define RANGE MAX_WCHAR /* written to the generated header as Cwc2c_DOMAIN_MAX */
55 #define II_LEN ((MAX_WCHAR+1) >> (TT_SHIFT+TI_SHIFT)) /* per-codeset wc2c index length; TT_SHIFT/TI_SHIFT are defined outside this excerpt */
58 unsigned long c2w[256];
59 unsigned char w2c[MAX_WCHAR];
60 unsigned char ii[II_LEN];
61 unsigned char ctype_idx[CTYPE_IDX_LEN];
62 unsigned char uplow_idx[UPLOW_IDX_LEN];
63 unsigned char c2wc_idx[C2WC_IDX_LEN];
66 /* Taking advantage of the C99 mutual-exclusion guarantees for the various
67 * (w)ctype classes, including the descriptions of printing and control
68 * (w)chars, we can place each in one of the following mutually-exclusive
69 * subsets. Since there are fewer than 16, we can store the data for
70 * each (w)char in a nibble. In contrast, glibc uses an unsigned int
71 * per (w)char, with one bit flag for each is* type. While this allows
72 * a simple '&' operation to determine the type vs. a range test and a
73 * little special handling for the "blank" and "xdigit" types in my
74 * approach, it also uses 8 times the space for the tables on the typical
75 * 32-bit archs we support. */
77 __CTYPE_unclassified = 0,
78 __CTYPE_alpha_nonupper_nonlower,
80 __CTYPE_alpha_upper_lower,
85 __CTYPE_print_space_nonblank,
86 __CTYPE_print_space_blank,
87 __CTYPE_space_nonblank_noncntrl,
88 __CTYPE_space_blank_noncntrl,
89 __CTYPE_cntrl_space_nonblank,
90 __CTYPE_cntrl_space_blank,
91 __CTYPE_cntrl_nonspace,
94 int main(int argc, char **argv)
99 unsigned long max_wchar;
105 unsigned char row[256];
107 unsigned short wrow[256];
109 char codeset_list[500];
110 char codeset_index[30];
111 int codeset_list_end = 0;
114 if (!setlocale(LC_CTYPE, "en_US.UTF-8")) {
115 printf("setlocale(LC_CTYPE,\"en_US.UTF-8\") failed!\n");
119 if (!(out = fopen("c8tables.h","w"))) {
120 printf("error: couldn't open file \"c8tables.h\"\n");
126 /* User requested 8-bit codesets, but didn't list any... */
127 /* Allow to build, just so this feature can be left on in config. */
128 fprintf(out, "#ifdef __CTYPE_HAS_8_BIT_LOCALES\n");
129 fprintf(out, "#warning ignoring 8 bit codesets request"
130 " as no codesets specified.\n");
131 fprintf(out, "#endif\n");
132 fprintf(out, "#undef __CTYPE_HAS_8_BIT_LOCALES\n\n");
134 fprintf(out, "#define NUM_CODESETS\t\t0\n");
135 fprintf(out, "#define CODESET_LIST\t\t\"\"\n");
140 /* fprintf(out, "#define __CTYPE_HAS_8_BIT_LOCALES\t1\n\n"); */
141 fprintf(out, "#ifdef __CTYPE_HAS_8_BIT_LOCALES\n\n");
145 fprintf(out, "#undef __CTYPE_HAS_8_BIT_LOCALES\n\n");
147 fprintf(out, "#define NUM_CODESETS\t\t0\n");
148 fprintf(out, "#define CODESET_LIST\t\t\"\"\n");
150 fprintf(out, "#define __CTYPE_HAS_8_BIT_LOCALES\t\t1\n\n");
153 fprintf(out, "#define Cctype_IDX_SHIFT\t%d\n", CTYPE_IDX_SHIFT);
154 fprintf(out, "#define Cctype_IDX_LEN\t\t%d\n", CTYPE_IDX_LEN);
156 fprintf(out, "#define Cctype_ROW_LEN\t\t%d\n", CTYPE_ROW_LEN >> 1);
157 fprintf(out, "#define Cctype_PACKED\t\t1\n");
159 fprintf(out, "#define Cctype_ROW_LEN\t\t%d\n", CTYPE_ROW_LEN);
160 fprintf(out, "#undef Cctype_PACKED\n");
163 fprintf(out, "\n#define Cuplow_IDX_SHIFT\t%d\n", UPLOW_IDX_SHIFT);
164 fprintf(out, "#define Cuplow_IDX_LEN\t\t%d\n", UPLOW_IDX_LEN);
165 fprintf(out, "#define Cuplow_ROW_LEN\t\t%d\n", UPLOW_ROW_LEN);
168 fprintf(out, "\n#define Cc2wc_IDX_LEN\t\t%d\n", C2WC_IDX_LEN);
169 fprintf(out, "#define Cc2wc_IDX_SHIFT\t\t%d\n", C2WC_IDX_SHIFT);
170 fprintf(out, "#define Cc2wc_ROW_LEN\t\t%d\n", C2WC_ROW_LEN);
173 fprintf(out, "\ntypedef struct {\n");
174 fprintf(out, "\tunsigned char idx8ctype[%d];\n", CTYPE_IDX_LEN);
175 fprintf(out, "\tunsigned char idx8uplow[%d];\n", UPLOW_IDX_LEN);
177 fprintf(out, "\tunsigned char idx8c2wc[%d];\n", C2WC_IDX_LEN);
178 fprintf(out, "\tunsigned char idx8wc2c[%d];\n", II_LEN);
180 fprintf(out, "} codeset_8_bit_t;\n\n");
182 fprintf(out, "#ifdef WANT_DATA\n\n");
183 fprintf(out, "static const codeset_8_bit_t codeset_8_bit[%d] = {\n", argc-1);
187 codeset_index[0] = 0;
189 if (!(fp = fopen(*++argv,"r"))) {
190 printf("error: couldn't open file \"%s\"\n", *argv);
193 printf("processing %s... ", *argv);
200 s0 = strrchr(*argv, '/');
206 s1 = strchr(s0, '.');
213 /* if ((numsets == 0) && strncmp("ASCII", s0, n)) { */
214 /* printf("error - first codeset isn't ASCII!\n"); */
215 /* return EXIT_FAILURE; */
218 if (numsets >= sizeof(codeset_index)) {
219 printf("error - too many codesets!\n");
223 if (codeset_list_end + n + 1 + numsets + 1 + 1 >= 256) {
224 printf("error - codeset list to big!\n");
228 codeset_index[numsets+1] = codeset_index[numsets] + n+1;
229 strncpy(codeset_list + codeset_list_end, s0, n);
230 codeset_list_end += (n+1);
231 codeset_list[codeset_list_end - 1] = 0;
233 fprintf(out, "\t{ /* %.*s */", n, s0);
236 memset(&csd[numsets],sizeof(charset_data),0);
237 memset(xi, sizeof(xi), 0);
242 while (fgets(buf,sizeof(buf),fp)) {
243 if ((2 != sscanf(buf, "{ %lx , %lx", &c, &wc))
244 || (c >= 256) || (wc > MAX_WCHAR)) {
245 printf("error: scanf failure! \"%s\"\n", buf);
249 /* don't put in w2c... dynamicly build tt instead. */
251 if (c <= 0x7f) { /* check the 7bit entries but don't store */
253 printf("error: c != wc in %s\n", buf);
256 csd[numsets].c2w[c] = wc;
257 csd[numsets].w2c[wc] = 0; /* ignore */
258 if (wc > max_wchar) {
262 csd[numsets].c2w[c] = wc;
263 csd[numsets].w2c[wc] = c;
264 if (wc > max_wchar) {
270 printf("%d lines ", lines);
272 for (i = 0 ; i <= MAX_WCHAR ; i += (1 << TT_SHIFT)) {
273 p = &csd[numsets].w2c[i];
274 for (j = 0 ; j < tt_num ; j++) {
275 if (!memcmp(p, &tt[j << TT_SHIFT], (1 << TT_SHIFT))) {
279 if (j == tt_num) { /* new entry */
280 memcpy(&tt[j << TT_SHIFT], p, (1 << TT_SHIFT));
283 xi[i >> TT_SHIFT] = j;
286 for (i = 0 ; i <= (MAX_WCHAR >> TT_SHIFT) ; i += (1 << TI_SHIFT)) {
288 for (j = 0 ; j < ti_num ; j++) {
289 if (!memcmp(p, &ti[j << TI_SHIFT], (1 << TI_SHIFT))) {
293 if (j == ti_num) { /* new entry */
294 memcpy(&ti[j << TI_SHIFT], p, (1 << TI_SHIFT));
297 csd[numsets].ii[i >> TI_SHIFT] = j;
298 /* printf("%d ", i >> TI_SHIFT); */
302 fprintf(out, "\n\t\t/* idx8ctype data */\n\t\t{");
303 for (i = 128 ; i < 256 ; i++) {
307 /* if (!(i & 0x7)) { */
308 /* fprintf(out, "\n"); */
311 c = csd[numsets].c2w[i];
313 if (c == 0) { /* non-existant char in codeset */
314 d = __CTYPE_unclassified;
315 } else if (iswdigit(c)) {
317 } else if (iswalpha(c)) {
318 d = __CTYPE_alpha_nonupper_nonlower;
320 d = __CTYPE_alpha_lower;
322 d = __CTYPE_alpha_upper_lower;
324 } else if (iswupper(c)) {
325 d = __CTYPE_alpha_upper;
327 } else if (iswpunct(c)) {
329 } else if (iswgraph(c)) {
331 } else if (iswprint(c)) {
332 d = __CTYPE_print_space_nonblank;
334 d = __CTYPE_print_space_blank;
336 } else if (iswspace(c) && !iswcntrl(c)) {
337 d = __CTYPE_space_nonblank_noncntrl;
339 d = __CTYPE_space_blank_noncntrl;
341 } else if (iswcntrl(c)) {
342 d = __CTYPE_cntrl_nonspace;
344 d = __CTYPE_cntrl_space_nonblank;
346 d = __CTYPE_cntrl_space_blank;
350 d = __CTYPE_unclassified;
354 row[i & (CTYPE_ROW_LEN-1)] = d;
355 if ((i & (CTYPE_ROW_LEN-1)) == (CTYPE_ROW_LEN-1)) {
357 for (j=0 ; j < n_ctype_rows ; j++) {
358 if (!memcmp(p, row, CTYPE_ROW_LEN)) {
363 if (j == n_ctype_rows) { /* new entry */
364 if (++n_ctype_rows > 256) {
365 printf("error -- to many ctype rows!\n");
368 memcpy(p, row, CTYPE_ROW_LEN);
370 csd[numsets].ctype_idx[i >> CTYPE_IDX_SHIFT] = j;
371 if (!((i >> CTYPE_IDX_SHIFT) & 0x7)
372 && (i != (127 + CTYPE_ROW_LEN))
374 fprintf(out, "\n\t\t ");
376 fprintf(out, " %#4x,", j);
379 fprintf(out, " %#4x,", d);
386 fprintf(out, ",\n\t\t/* idx8uplow data */\n\t\t{");
387 for (i = 128 ; i < 256 ; i++) {
389 /* if (!(i & 0x7)) { */
390 /* fprintf(out, "\n"); */
392 c = csd[numsets].c2w[i];
397 if (u >= 0x80) u = csd[numsets].w2c[u];
398 if (l >= 0x80) l = csd[numsets].w2c[l];
400 if (u == 0) u = i; /* upper is missing, so ignore */
401 if (l == 0) l = i; /* lower is missing, so ignore */
404 /* store as unsigned char and let overflow handle it. */
405 /* if ((((u-i) < CHAR_MIN) || ((u-i) > CHAR_MAX)) */
406 /* || (((i-l) < CHAR_MIN) || ((i-l) > CHAR_MAX)) */
408 /* printf("error - uplow diff out of range! %d %ld %ld\n", */
410 /* return EXIT_FAILURE; */
413 row[i & (UPLOW_ROW_LEN-1)] = ((l==i) ? (u-i) : (i-l));
414 if ((i & (UPLOW_ROW_LEN-1)) == (UPLOW_ROW_LEN-1)) {
416 for (j=0 ; j < n_uplow_rows ; j++) {
417 if (!memcmp(p, row, UPLOW_ROW_LEN)) {
422 if (j == n_uplow_rows) { /* new entry */
423 if (++n_uplow_rows > 256) {
424 printf("error -- to many uplow rows!\n");
427 memcpy(p, row, UPLOW_ROW_LEN);
429 csd[numsets].uplow_idx[i >> UPLOW_IDX_SHIFT] = j;
430 if (!((i >> UPLOW_IDX_SHIFT) & 0x7)
431 && (i != (127 + UPLOW_ROW_LEN))
433 fprintf(out, "\n\t\t ");
435 fprintf(out, " %#4x,", j);
439 if (!(i & 0x7) && i) {
442 fprintf(out, " %4ld,", (l==i) ? (u-i) : (i-l));
443 /* fprintf(out, " %4ld,", (l==i) ? u : l); */
445 if ((u != i) || (l != i)) {
447 fprintf(out, " %#08lx, %#08lx, %#08lx, %#08lx, %#08lx, %#08lx, \n",
451 (unsigned long) towlower(c),
453 (unsigned long) towupper(c));
456 fprintf(out, " %#08lx, %8ld, %d, %8ld, %d, %#08lx\n",
473 #else /* DO_WIDE_CHAR */
476 fprintf(out, ",\n\t\t/* idx8c2wc data */\n\t\t{");
477 for (i = 128 ; i < 256 ; i++) {
479 wrow[i & (C2WC_ROW_LEN-1)] = csd[numsets].c2w[i];
480 if ((i & (C2WC_ROW_LEN-1)) == (C2WC_ROW_LEN-1)) {
481 p = (char *) c2wc_tbl;
482 for (j=0 ; j < n_c2wc_rows ; j++) {
483 if (!memcmp(p, (char *) wrow, 2*C2WC_ROW_LEN)) {
488 if (j == n_c2wc_rows) { /* new entry */
489 if (++n_c2wc_rows > 256) {
490 printf("error -- to many c2wc rows!\n");
493 memcpy(p, (char *) wrow, 2*C2WC_ROW_LEN);
495 csd[numsets].c2wc_idx[i >> C2WC_IDX_SHIFT] = j;
496 if (!((i >> C2WC_IDX_SHIFT) & 0x7)
497 && (i != (127 + C2WC_ROW_LEN))
499 fprintf(out, "\n\t\t ");
501 fprintf(out, " %#4x,", j);
504 if (!(i & 0x7) && i) {
507 fprintf(out, " %#6lx,", csd[numsets].c2w[i]);
510 fprintf(out, " },\n");
514 /* fprintf(out, "\nII_LEN = %d\n", II_LEN); */
515 fprintf(out, "\t\t/* idx8wc2c data */\n\t\t{");
516 for (i = 0 ; i < II_LEN ; i++) {
517 if (!(i & 0x7) && i) {
518 fprintf(out, "\n\t\t ");
520 fprintf(out, " %#4x,", csd[numsets].ii[i]);
522 fprintf(out, " }\n");
525 #endif /* DO_WIDE_CHAR */
526 fprintf(out, "\t},\n");
532 fprintf(out, "};\n");
533 fprintf(out, "\n#endif /* WANT_DATA */\n");
537 fprintf(out, "#define Cwc2c_DOMAIN_MAX\t%#x\n", RANGE);
538 fprintf(out, "#define Cwc2c_TI_SHIFT\t\t%d\n", TI_SHIFT);
539 fprintf(out, "#define Cwc2c_TT_SHIFT\t\t%d\n", TT_SHIFT);
540 fprintf(out, "#define Cwc2c_II_LEN\t\t%d\n", II_LEN);
541 fprintf(out, "#define Cwc2c_TI_LEN\t\t%d\n", ti_num << TI_SHIFT);
542 fprintf(out, "#define Cwc2c_TT_LEN\t\t%d\n", tt_num << TT_SHIFT);
545 fprintf(out, "\n#define Cwc2c_TBL_LEN\t\t%d\n",
546 (ti_num << TI_SHIFT) + (tt_num << TT_SHIFT));
548 fprintf(out, "#ifdef WANT_DATA\n\n");
549 fprintf(out, "static const unsigned char Cwc2c_data[%d] = {\n",
550 (ti_num << TI_SHIFT) + (tt_num << TT_SHIFT));
551 fprintf(out, "\t/* ti_table */\n\t");
552 for (i=0 ; i < ti_num << TI_SHIFT ; i++) {
554 fprintf(out, "\n\t");
556 fprintf(out, " %#4x,", ti[i]);
559 fprintf(out, "\t/* tt_table */\n\t");
560 for (i=0 ; i < tt_num << TT_SHIFT ; i++) {
562 fprintf(out, "\n\t");
564 fprintf(out, " %#4x,", tt[i]);
566 fprintf(out, "\n};\n");
568 fprintf(out, "\n#endif /* WANT_DATA */\n");
569 #endif /* DO_WIDE_CHAR */
571 fprintf(out, "\n#define Cuplow_TBL_LEN\t\t%d\n",
572 n_uplow_rows * UPLOW_ROW_LEN);
573 fprintf(out, "\n#ifdef WANT_DATA\n\n");
575 fprintf(out, "\nstatic const unsigned char Cuplow_data[%d] = {\n",
576 n_uplow_rows * UPLOW_ROW_LEN);
578 for (j=0 ; j < n_uplow_rows ; j++) {
580 for (i=0 ; i < UPLOW_ROW_LEN ; i++) {
581 fprintf(out, " %#4x,", (unsigned int)((unsigned char) p[i]));
586 fprintf(out, "};\n");
588 fprintf(out, "\n#endif /* WANT_DATA */\n");
589 fprintf(out, "\n#define Cctype_TBL_LEN\t\t%d\n",
591 n_ctype_rows * CTYPE_ROW_LEN / 2
593 n_ctype_rows * CTYPE_ROW_LEN
596 fprintf(out, "\n#ifdef WANT_DATA\n\n");
599 fprintf(out, "\nstatic const unsigned char Cctype_data[%d] = {\n",
601 n_ctype_rows * CTYPE_ROW_LEN / 2
603 n_ctype_rows * CTYPE_ROW_LEN
607 for (j=0 ; j < n_ctype_rows ; j++) {
609 for (i=0 ; i < CTYPE_ROW_LEN ; i++) {
611 fprintf(out, " %#4x,", (unsigned int)(p[i] + (p[i+1] << 4)));
614 fprintf(out, " %#4x,", (unsigned int)p[i]);
620 fprintf(out, "};\n");
622 fprintf(out, "\n#endif /* WANT_DATA */\n");
626 fprintf(out, "\n#define Cc2wc_TBL_LEN\t\t%d\n",
627 n_c2wc_rows * C2WC_ROW_LEN);
628 fprintf(out, "\n#ifdef WANT_DATA\n\n");
630 fprintf(out, "\nstatic const unsigned short Cc2wc_data[%d] = {\n",
631 n_c2wc_rows * C2WC_ROW_LEN);
632 p = (char *) c2wc_tbl;
633 for (j=0 ; j < n_c2wc_rows ; j++) {
635 for (i=0 ; i < C2WC_ROW_LEN ; i++) {
636 fprintf(out, " %#6x,", (unsigned int)(((unsigned short *)p)[i]));
641 fprintf(out, "};\n");
642 fprintf(out, "\n#endif /* WANT_DATA */\n");
643 #endif /* DO_WIDE_CHAR */
644 fprintf(out, "\n\n");
646 fprintf(out, "#define NUM_CODESETS\t\t%d\n", numsets);
647 fprintf(out, "#define CODESET_LIST \\\n\t\"");
648 for (i=0 ; i < numsets ; i++) {
649 fprintf(out, "\\x%02x", numsets + 1 + (unsigned char) codeset_index[i]);
650 if (((i & 7) == 7) && (i + 1 < numsets)) {
651 fprintf(out, "\" \\\n\t\"");
654 fprintf(out, "\" \\\n\t\"\\0\"");
655 for (i=0 ; i < numsets ; i++) {
656 fprintf(out, " \\\n\t\"%s\\0\"",
657 codeset_list + ((unsigned char)codeset_index[i]));
660 fprintf(out, "\n\n");
661 for (i=0 ; i < numsets ; i++) {
664 strcpy(buf, codeset_list + ((unsigned char)codeset_index[i]));
665 for (z=buf ; *z ; z++) {
670 fprintf(out, "#define __CTYPE_HAS_CODESET_%s\n", buf);
673 fprintf(out, "#define __CTYPE_HAS_CODESET_UTF_8\n");
674 #endif /* DO_WIDE_CHAR */
677 fprintf(out, "\n#endif /* __CTYPE_HAS_8_BIT_LOCALES */\n\n");
684 printf("tt_num = %d ti_num = %d\n", tt_num, ti_num);
685 printf("max_wchar = %#lx\n", max_wchar);
687 printf("size is %d * %d + %d * %d + %d * %d = %d\n",
688 tt_num, 1 << TT_SHIFT, ti_num, 1 << TI_SHIFT,
689 ((MAX_WCHAR >> (TT_SHIFT + TI_SHIFT)) + 1), numsets,
690 j = tt_num * (1 << TT_SHIFT) + ti_num * (1 << TI_SHIFT)
691 + ((MAX_WCHAR >> (TT_SHIFT + TI_SHIFT)) + 1) * numsets);
693 #endif /* DO_WIDE_CHAR */
701 printf("ctype - CTYPE_IDX_SHIFT = %d -- %d * %d + %d * %d = %d\n",
702 CTYPE_IDX_SHIFT, numsets, CTYPE_IDX_LEN, n_ctype_rows, CTYPE_ROW_LEN / i,
703 j = numsets * CTYPE_IDX_LEN + n_ctype_rows * CTYPE_ROW_LEN / i);
706 printf("uplow - UPLOW_IDX_SHIFT = %d -- %d * %d + %d * %d = %d\n",
707 UPLOW_IDX_SHIFT, numsets, UPLOW_IDX_LEN, n_uplow_rows, UPLOW_ROW_LEN,
708 j = numsets * UPLOW_IDX_LEN + n_uplow_rows * UPLOW_ROW_LEN);
713 printf("c2wc - C2WC_IDX_SHIFT = %d -- %d * %d + 2 * %d * %d = %d\n",
714 C2WC_IDX_SHIFT, numsets, C2WC_IDX_LEN, n_c2wc_rows, C2WC_ROW_LEN,
715 j = numsets * C2WC_IDX_LEN + 2 * n_c2wc_rows * C2WC_ROW_LEN);
718 #endif /* DO_WIDE_CHAR */
720 printf("total size = %d\n", total_size);
722 /* for (i=0 ; i < numsets ; i++) { */
723 /* printf("codeset_index[i] = %d codeset_list[ci[i]] = \"%s\"\n", */
724 /* (unsigned char) codeset_index[i], */
725 /* codeset_list + ((unsigned char)codeset_index[i])); */