2 /* Copyright (C) 2002 Manuel Novoa III
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
21 * Besides uClibc, I'm using this code in my libc for elks, which is
22 * a 16-bit environment with a fairly limited compiler. It would make
23 * things much easier for me if this file isn't modified unnecessarily.
24 * In particular, please put any new or replacement functions somewhere
25 * else, and modify the makefile to use your version instead.
28 * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
31 /* May 23, 2002 Initial Notes:
33 * I'm still tweaking this stuff, but it passes the tests I've thrown
34 * at it, and Erik needs it for the gcc port. The glibc extension
35 * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36 * in the glibc source. I also need to fix the behavior of
37 * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
39 * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40 * file on my platform (x86) show about 5-10% faster conversion speed than
41 * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42 * individual mbrtowc()/wcrtomb() calls.
44 * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45 * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
46 * needs to deal gracefully with whatever is sent to it. In that mode,
47 * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
48 * an arg to force that behavior, so the interface will be changing.
50 * I need to fix the error checking for 16-bit wide chars. This isn't
51 * an issue for uClibc, but may be for ELKS. I'm currently not sure
52 * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
56 * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57 * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
59 * Enabled building of a C/POSIX-locale-only version, so full locale support
60 * no longer needs to be enabled.
64 * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
65 * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66 * order to support %ls in printf. See comments below for details.
67 * Change behaviour of wc<->mb functions when in the C locale. Now they do
68 * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
69 * and consistency with the stds requirements that a printf format string be
70 * a valid multibyte string beginning and ending in its initial shift state.
74 * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
78 * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79 * Added some size/speed optimizations and integrated it into my locale
80 * framework. Minimally tested at the moment, but the stub C-locale
81 * version (which most people would probably be using) should be fine.
85 * Revert the wc<->mb changes from earlier this month involving the C-locale.
86 * Add a couple of ugly hacks to support *wprintf.
87 * Add a mini iconv() and iconv implementation (requires locale support).
93 #define _ISOC99_SOURCE
105 #ifdef __UCLIBC_HAS_LOCALE__
106 #define ENCODING (__global_locale.encoding)
107 #ifndef __CTYPE_HAS_UTF_8_LOCALES
108 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
111 #define ENCODING (__ctype_encoding_7_bit)
112 #ifdef __CTYPE_HAS_8_BIT_LOCALES
113 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
115 #ifdef __CTYPE_HAS_UTF_8_LOCALES
116 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
118 #undef L__wchar_utf8sntowcs
119 #undef L__wchar_wcsntoutf8s
122 #if WCHAR_MAX > 0xffffUL
123 #define UTF_8_MAX_LEN 6
125 #define UTF_8_MAX_LEN 3
130 /* Implementation-specific work functions. */
132 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
133 const char **__restrict src, size_t n,
134 mbstate_t *ps, int allow_continuation);
136 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
137 const wchar_t **__restrict src, size_t wn);
139 /* glibc extensions. */
141 extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
142 const char **__restrict src,
143 size_t NMC, size_t len, mbstate_t *__restrict ps);
145 extern size_t __wcsnrtombs(char *__restrict dst,
146 const wchar_t **__restrict src,
147 size_t NWC, size_t len, mbstate_t *__restrict ps);
149 /**********************************************************************/
154 #ifdef __CTYPE_HAS_8_BIT_LOCALES
157 unsigned char buf[1];
161 *buf = (unsigned char) c;
162 mbstate.mask = 0; /* Initialize the mbstate. */
163 if (mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
169 #else /* __CTYPE_HAS_8_BIT_LOCALES */
171 #ifdef __UCLIBC_HAS_LOCALE__
172 assert((ENCODING == __ctype_encoding_7_bit)
173 || (ENCODING == __ctype_encoding_utf8));
174 #endif /* __UCLIBC_HAS_LOCALE__ */
176 /* If we don't have 8-bit locale support, then this is trivial since
177 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
178 return (((unsigned int)c) < 0x80) ? c : WEOF;
180 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
184 /**********************************************************************/
187 /* Note: We completely ignore ps in all currently supported conversions. */
191 #ifdef __CTYPE_HAS_8_BIT_LOCALES
193 unsigned char buf[MB_LEN_MAX];
195 return (wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
197 #else /* __CTYPE_HAS_8_BIT_LOCALES */
199 #ifdef __UCLIBC_HAS_LOCALE__
200 assert((ENCODING == __ctype_encoding_7_bit)
201 || (ENCODING == __ctype_encoding_utf8));
202 #endif /* __UCLIBC_HAS_LOCALE__ */
204 /* If we don't have 8-bit locale support, then this is trivial since
205 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
207 /* TODO: need unsigned version of wint_t... */
208 /* return (((unsigned int)c) < 0x80) ? c : WEOF; */
209 return ((c >= 0) && (c < 0x80)) ? c : EOF;
211 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
215 /**********************************************************************/
/* Report whether *ps is in the initial conversion state.
 *
 * ps: conversion state to inspect; may be NULL.
 *
 * Evaluates to nonzero when ps is NULL or when ps->mask is zero (a zero
 * mask is what this file stores to initialize an mbstate_t), and to 0
 * otherwise. */
218 int mbsinit(const mbstate_t *ps)
220 return !ps || !ps->mask;
224 /**********************************************************************/
/* mbrlen is exported as a weak alias of __mbrlen. */
227 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
228 __attribute__ ((__weak__, __alias__("__mbrlen")));
/* Thin wrapper around mbrtowc() that discards the converted character.
 *
 * s:  multibyte input, forwarded unchanged.
 * n:  byte limit, forwarded unchanged.
 * ps: conversion state; when NULL, a function-local static state object
 *     private to this function is used instead.
 *
 * Returns whatever mbrtowc() returns for a NULL destination. */
230 size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
232 static mbstate_t mbstate; /* Rely on bss 0-init. */
/* Pass NULL as the destination: only the length result is wanted. */
234 return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
238 /**********************************************************************/
241 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
242 size_t n, mbstate_t *__restrict ps)
244 static mbstate_t mbstate; /* Rely on bss 0-init. */
248 char empty_string[1]; /* Avoid static to be fPIC friendly. */
255 pwc = (wchar_t *) s; /* NULL */
256 empty_string[0] = 0; /* Init the empty string when necessary. */
260 return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
261 ? ((size_t) -1) : ((size_t) -2);
266 #ifdef __CTYPE_HAS_UTF_8_LOCALES
267 /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
268 if (ENCODING == __ctype_encoding_utf8) {
269 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
270 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
274 r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
276 if (((ssize_t) r) >= 0) {
285 /**********************************************************************/
288 /* Note: We completely ignore ps in all currently supported conversions. */
289 /* TODO: Check for valid state anyway? */
291 size_t wcrtomb(register char *__restrict s, wchar_t wc,
292 mbstate_t *__restrict ps)
297 char buf[MB_LEN_MAX];
308 r = __wcsnrtombs(s, &pwc, SIZE_MAX, MB_LEN_MAX, ps);
309 return (r != 0) ? r : 1;
313 /**********************************************************************/
/* Convert a multibyte string to wide characters by delegating to
 * __mbsnrtowcs() with no limit on the number of source bytes.
 *
 * dst: destination wide-character buffer, forwarded unchanged.
 * src: address of the source pointer, forwarded unchanged.
 * len: forwarded unchanged as the len argument of __mbsnrtowcs().
 * ps:  conversion state; when NULL, a function-local static state object
 *      private to this function is used instead.
 *
 * Returns the result of __mbsnrtowcs(). */
316 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
317 size_t len, mbstate_t *__restrict ps)
319 static mbstate_t mbstate; /* Rely on bss 0-init. */
/* SIZE_MAX is passed in the NMC slot, i.e. the source is not length-limited. */
321 return __mbsnrtowcs(dst, src, SIZE_MAX, len,
322 ((ps != NULL) ? ps : &mbstate));
326 /**********************************************************************/
329 /* Note: We completely ignore ps in all currently supported conversions.
331 * TODO: Check for valid state anyway? */
/* Convert a wide-character string to multibyte form by delegating to
 * __wcsnrtombs() with SIZE_MAX in the NWC slot (no limit on the number of
 * source wide characters).
 *
 * dst, src, len and ps are all forwarded unchanged; unlike mbsrtowcs(),
 * no substitute state object is supplied when ps is NULL.
 *
 * Returns the result of __wcsnrtombs(). */
333 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
334 size_t len, mbstate_t *__restrict ps)
336 return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
340 /**********************************************************************/
341 #ifdef L__wchar_utf8sntowcs
343 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
344 * UTF-8-test.txt stress test.
346 /* #define DECODER */
354 size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
355 const char **__restrict src, size_t n,
356 mbstate_t *ps, int allow_continuation)
358 register const char *s;
371 /* NOTE: The following is an AWFUL HACK! In order to support %s in
372 * wprintf, we need to be able to compute the number of wchars needed
373 * for the mbs conversion, not to exceed the precision specified.
374 * But if dst is NULL, the return value is the length assuming a
375 * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
376 * as pwc in order to flag that we really want the length, subject
377 * to the restricted buffer size and no partial conversions.
378 * See mbsnrtowcs() as well. */
379 if (!pwc || (pwc == ((wchar_t *)ps))) {
387 /* This is really here only to support the glibc extension function
388 * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
389 * check on the validity of the mbstate. */
394 if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
396 wc = (__uwchar_t) ps->wc;
402 if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
403 /* TODO: change error code here and below? */
409 return (size_t) -1; /* We're in an error state. */
418 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
420 #ifdef __UCLIBC_MJN3_ONLY__
421 #warning fix range for 16 bit wides
423 if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
433 return (size_t) -1; /* Illegal start byte! */
439 if ((*s & 0xc0) != 0x80) {
444 wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
449 if ((wc & mask) == 0) { /* Character completed. */
450 if ((mask >>= 5) == 0x40) {
453 /* Check for invalid sequences (longer than necessary)
454 * and invalid chars. */
455 if ( (wc < mask) /* Sequence not minimal length. */
457 #if UTF_8_MAX_LEN == 3
458 #error broken since mask can overflow!!
459 /* For plane 0, these are the only defined values.*/
462 /* Note that we don't need to worry about exceeding */
463 /* 31 bits as that is the most that UTF-8 provides. */
464 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
466 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
474 /* Character potentially valid but incomplete. */
475 if (!allow_continuation) {
479 /* NOTE: The following can fail if you allow and then disallow
481 #if UTF_8_MAX_LEN == 3
482 #error broken since mask can overflow!!
484 /* Need to back up... */
487 } while ((mask >>= 5) >= 0x40);
490 ps->mask = (wchar_t) mask;
491 ps->wc = (wchar_t) wc;
503 while (wc && --count);
511 /* ps->wc is irrelevant here. */
521 /**********************************************************************/
522 #ifdef L__wchar_wcsntoutf8s
524 size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
525 const wchar_t **__restrict src, size_t wn)
530 const __uwchar_t *swc;
532 char buf[MB_LEN_MAX];
536 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
537 * printf, we need to be able to compute the number of bytes needed
538 * for the mbs conversion, not to exceed the precision specified.
539 * But if dst is NULL, the return value is the length assuming a
540 * sufficiently sized buffer. So, we allow passing of (char *) src
541 * as dst in order to flag that we really want the length, subject
542 * to the restricted buffer size and no partial conversions.
543 * See wcsnrtombs() as well. */
544 if (!s || (s == ((char *) src))) {
553 swc = (const __uwchar_t *) *src;
566 #if UTF_8_MAX_LEN == 3
567 /* For plane 0, these are the only defined values.*/
568 /* Note that we don't need to worry about exceeding */
569 /* 31 bits as that is the most that UTF-8 provides. */
572 /* UTF_8_MAX_LEN == 6 */
574 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
576 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
581 #if UTF_8_MAX_LEN != 3
582 if (wc > 0x7fffffffUL) { /* Value too large. */
594 if ((len = p - s) > t) { /* Not enough space. */
601 *--p = (wc & 0x3f) | 0x80;
605 } else if (wc == 0) { /* End of string. */
619 *src = (const wchar_t *) swc;
627 /**********************************************************************/
628 #ifdef L___mbsnrtowcs
630 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
632 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
633 size_t NMC, size_t len, mbstate_t *__restrict ps)
634 __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
636 size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
637 size_t NMC, size_t len, mbstate_t *__restrict ps)
639 static mbstate_t mbstate; /* Rely on bss 0-init. */
649 #ifdef __CTYPE_HAS_UTF_8_LOCALES
650 if (ENCODING == __ctype_encoding_utf8) {
652 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
653 != (size_t) -2) ? r : 0;
657 /* NOTE: The following is an AWFUL HACK! In order to support %s in
658 * wprintf, we need to be able to compute the number of wchars needed
659 * for the mbs conversion, not to exceed the precision specified.
660 * But if dst is NULL, the return value is the length assuming a
661 * sufficiently sized buffer. So, we allow passing of ((wchar_t *)ps)
662 * as dst in order to flag that we really want the length, subject
663 * to the restricted buffer size and no partial conversions.
664 * See _wchar_utf8sntowcs() as well. */
665 if (!dst || (dst == ((wchar_t *)ps))) {
673 /* Since all the following encodings are single-byte encodings... */
681 #ifdef __CTYPE_HAS_8_BIT_LOCALES
682 if (ENCODING == __ctype_encoding_8_bit) {
685 if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
687 wc = __global_locale.tbl8c2wc[
688 (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
689 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
709 #ifdef __UCLIBC_HAS_LOCALE__
710 assert(ENCODING == __ctype_encoding_7_bit);
714 if ((*dst = (unsigned char) *s) == 0) {
719 #ifdef __CTYPE_HAS_8_BIT_LOCALES
736 /**********************************************************************/
737 #ifdef L___wcsnrtombs
739 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
741 /* Note: We completely ignore ps in all currently supported conversions.
742 * TODO: Check for valid state anyway? */
744 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
745 size_t NWC, size_t len, mbstate_t *__restrict ps)
746 __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
748 size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
749 size_t NWC, size_t len, mbstate_t *__restrict ps)
754 char buf[MB_LEN_MAX];
756 #ifdef __CTYPE_HAS_UTF_8_LOCALES
757 if (ENCODING == __ctype_encoding_utf8) {
758 return _wchar_wcsntoutf8s(dst, len, src, NWC);
760 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
763 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
764 * printf, we need to be able to compute the number of bytes needed
765 * for the mbs conversion, not to exceed the precision specified.
766 * But if dst is NULL, the return value is the length assuming a
767 * sufficiently sized buffer. So, we allow passing of (char *) src
768 * as dst in order to flag that we really want the length, subject
769 * to the restricted buffer size and no partial conversions.
770 * See _wchar_wcsntoutf8s() as well. */
771 if (!dst || (dst == ((char *) src))) {
779 /* Since all the following encodings are single-byte encodings... */
785 s = (const __uwchar_t *) *src;
787 #ifdef __CTYPE_HAS_8_BIT_LOCALES
788 if (ENCODING == __ctype_encoding_8_bit) {
792 if ((wc = *s) <= 0x7f) {
793 if (!(*dst = (unsigned char) wc)) {
799 if (wc <= Cwc2c_DOMAIN_MAX) {
800 u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
802 u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
803 + ((wc >> Cwc2c_TT_SHIFT)
804 & ((1 << Cwc2c_TI_SHIFT)-1))];
805 u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
806 + (u << Cwc2c_TT_SHIFT)
807 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
810 #define __WCHAR_REPLACEMENT_CHAR '?'
811 #ifdef __WCHAR_REPLACEMENT_CHAR
812 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
813 #else /* __WCHAR_REPLACEMENT_CHAR */
817 *dst = (unsigned char) u;
818 #endif /* __WCHAR_REPLACEMENT_CHAR */
825 *src = (const wchar_t *) s;
829 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
831 #ifdef __UCLIBC_HAS_LOCALE__
832 assert(ENCODING == __ctype_encoding_7_bit);
837 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
843 if ((*dst = (unsigned char) *s) == 0) {
852 *src = (const wchar_t *) s;
858 /**********************************************************************/
861 #ifdef __UCLIBC_MJN3_ONLY__
862 #warning if we start doing translit, wcwidth and wcswidth will need updating.
865 #if defined(__UCLIBC_HAS_LOCALE__) && \
866 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
868 static const unsigned char new_idx[] = {
869 0, 5, 5, 6, 10, 15, 28, 39,
870 48, 48, 71, 94, 113, 128, 139, 154,
871 175, 186, 188, 188, 188, 188, 188, 188,
872 203, 208, 208, 208, 208, 208, 208, 208,
873 208, 219, 219, 219, 222, 222, 222, 222,
874 222, 222, 222, 222, 222, 222, 222, 224,
875 224, 231, 231, 231, 231, 231, 231, 231,
876 231, 231, 231, 231, 231, 231, 231, 231,
877 231, 231, 231, 231, 231, 231, 231, 231,
878 231, 231, 231, 231, 231, 231, 231, 231,
879 231, 231, 231, 231, 231, 231, 231, 231,
880 231, 231, 231, 231, 231, 231, 231, 231,
881 231, 231, 231, 231, 231, 231, 231, 231,
882 231, 231, 231, 231, 231, 231, 231, 231,
883 231, 231, 231, 231, 231, 231, 231, 231,
884 231, 231, 231, 231, 231, 231, 231, 231,
885 231, 231, 231, 231, 231, 231, 231, 231,
886 231, 231, 231, 231, 231, 231, 231, 231,
887 231, 231, 231, 231, 231, 231, 231, 231,
888 231, 231, 231, 231, 231, 231, 231, 231,
889 231, 231, 231, 231, 231, 233, 233, 233,
890 233, 233, 233, 233, 234, 234, 234, 234,
891 234, 234, 234, 234, 234, 234, 234, 234,
892 234, 234, 234, 234, 234, 234, 234, 234,
893 234, 234, 234, 234, 234, 234, 234, 234,
894 234, 234, 234, 234, 234, 234, 234, 234,
895 234, 234, 234, 234, 234, 234, 234, 234,
896 236, 236, 236, 236, 236, 236, 236, 236,
897 236, 236, 236, 236, 236, 236, 236, 236,
898 236, 236, 236, 236, 236, 236, 236, 236,
899 236, 236, 236, 236, 236, 236, 236, 236,
900 236, 237, 237, 238, 241, 241, 242, 249,
904 static const unsigned char new_tbl[] = {
905 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
906 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
907 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
908 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
909 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
910 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
911 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
912 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
913 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
914 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
915 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
916 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
917 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
918 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
919 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
920 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
921 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
922 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
923 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
924 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
925 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
926 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
927 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
928 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
929 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
930 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
931 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
932 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
933 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
934 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
935 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
936 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
939 static const signed char new_wtbl[] = {
940 0, -1, 1, -1, 1, 1, 0, 1,
941 0, 1, 1, 0, 1, 0, 1, 1,
942 0, 1, 0, 1, 0, 1, 0, 1,
943 0, 1, 0, 1, 1, 0, 1, 0,
944 1, 0, 1, 0, 1, 0, 1, 1,
945 0, 1, 0, 1, 0, 1, 0, 1,
946 1, 0, 1, 0, 1, 0, 1, 0,
947 1, 0, 1, 0, 1, 0, 1, 0,
948 1, 0, 1, 0, 1, 0, 1, 1,
949 0, 1, 0, 1, 0, 1, 0, 1,
950 0, 1, 0, 1, 0, 1, 0, 1,
951 0, 1, 0, 1, 0, 1, 1, 0,
952 1, 0, 1, 0, 1, 0, 1, 0,
953 1, 0, 1, 0, 1, 0, 1, 0,
954 1, 1, 0, 1, 0, 1, 0, 1,
955 0, 1, 0, 1, 0, 1, 0, 1,
956 1, 0, 1, 0, 1, 0, 1, 0,
957 1, 0, 1, 1, 0, 1, 0, 1,
958 0, 1, 0, 1, 0, 1, 0, 1,
959 0, 1, 1, 0, 1, 0, 1, 0,
960 1, 0, 1, 0, 1, 0, 1, 0,
961 1, 0, 1, 0, 1, 0, 1, 1,
962 0, 1, 0, 1, 0, 1, 0, 1,
963 0, 1, 2, 0, 1, 0, 1, 0,
964 1, 0, 1, 0, 1, 0, 1, 0,
965 1, 0, 1, 1, 0, 1, 0, 1,
966 1, 0, 1, 0, 1, 0, 1, 0,
967 1, 0, 1, 1, 2, 1, 1, 2,
968 2, 0, 2, 1, 2, 0, 2, 2,
969 1, 1, 2, 1, 1, 2, 1, 0,
970 1, 1, 0, 1, 0, 1, 2, 1,
974 int wcswidth(const wchar_t *pwcs, size_t n)
980 if (ENCODING == __ctype_encoding_7_bit) {
983 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
984 if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
989 #ifdef __CTYPE_HAS_8_BIT_LOCALES
990 else if (ENCODING == __ctype_encoding_8_bit) {
993 mbstate.mask = 0; /* Initialize the mbstate. */
994 if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
998 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
999 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1000 /* For stricter handling of allowed unicode values... see comments above. */
1001 else if (ENCODING == __ctype_encoding_utf8) {
1004 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1005 if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1006 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1012 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1014 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1016 /* If we're here, wc != 0. */
1017 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1023 if (((unsigned int) wc) <= 0xffff) {
1028 while ((m = (l+h) >> 1) != l) {
1029 if (b >= new_tbl[m]) {
1031 } else { /* wc < tbl[m] */
1035 count += new_wtbl[l]; /* none should be -1. */
1039 /* Redo this to minimize average number of compares?*/
1040 if (wc >= 0x1d167) {
1041 if (wc <= 0x1d1ad) {
1047 || (wc >= 0x1d1aa))))))
1051 } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1053 } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1054 ++count; /* need 2.. add one here */
1056 #if (WCHAR_MAX > 0x7fffffffL)
1057 else if (wc > 0x7fffffffL) {
1060 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1069 #else /* __UCLIBC_HAS_LOCALE__ */
1071 int wcswidth(const wchar_t *pwcs, size_t n)
1076 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1078 /* If we're here, wc != 0. */
1079 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1092 #endif /* __UCLIBC_HAS_LOCALE__ */
1095 /**********************************************************************/
/* Width of a single wide character.
 *
 * Implemented as wcswidth() applied to the one-element sequence holding wc
 * with a count of 1, so the result is whatever wcswidth() reports for it. */
1098 int wcwidth(wchar_t wc)
1100 return wcswidth(&wc, 1);
1104 /**********************************************************************/
1109 mbstate_t fromstate;
1117 int skip_invalid_input; /* To support iconv -c option. */
1127 #include <byteswap.h>
1129 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1130 #error unsupported endianness for iconv
1133 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1134 #error currently iconv requires 8 bit locales
1136 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1137 #error currently iconv requires UTF-8 locales
1143 IC_MULTIBYTE = 0xe0,
1144 #if __BYTE_ORDER == __BIG_ENDIAN
1159 /* For the multibyte
1160 * bit 0 means swap endian
1161 * bit 1 means 2 byte
1162 * bit 2 means 4 byte
/*
 * Table of the built-in codeset names known to iconv.
 *
 * Every record has the form
 *
 *     <record length> <codeset id> <name> <NUL>
 *
 * The length byte counts the whole record, so a scan walks the table with
 * "s += *s" and finds the name at s + 2.  The last record leaves out its
 * own NUL and relies on the string literal's terminator, which doubles as
 * the end-of-table marker; that is why "ASCII" must stay last.
 *
 * For ids >= 0xe0 (fixed-width encodings): bit 0 means swap endian,
 * bit 1 means 2 byte, bit 2 means 4 byte.  iconv_open() additionally reads
 * bit 0x10 of the id as the "emit/expect BOM" flag.
 *
 * Both header bytes of every record must be written as \x hex escapes and
 * the name must start a new string literal; otherwise the record's real
 * size no longer matches its length byte and the scan loses sync.
 *
 * NOTE(review): the platform-endian "UTF-16"/"UTF-32" records carry the
 * 0x10 BOM bit only in the little-endian branch; confirm whether the
 * big-endian branch is meant to differ.
 */
const unsigned char codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";		/* Must be last! (special case to save a nul) */
1199 static int find_codeset(const char *name)
1201 const unsigned char *s;
1204 for (s = codesets ; *s ; s += *s) {
1205 if (!strcasecmp(s+2, name)) {
1210 /* The following is ripped from find_locale in locale.c. */
1212 /* TODO: maybe CODESET_LIST + *s ??? */
1213 /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1217 ++codeset; /* Increment codeset first. */
1218 if (!strcasecmp(CODESET_LIST+*s, name)) {
1223 return 0; /* No matching codeset! */
1226 iconv_t iconv_open(const char *tocode, const char *fromcode)
1228 register _UC_iconv_t *px;
1229 int tocodeset, fromcodeset;
1231 if (((tocodeset = find_codeset(tocode)) != 0)
1232 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1233 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1234 px->tocodeset = tocodeset;
1235 px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1236 px->fromcodeset0 = px->fromcodeset = fromcodeset;
1237 px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1238 px->skip_invalid_input = px->tostate.mask = px->fromstate.mask = 0;
1239 return (iconv_t) px;
1242 __set_errno(EINVAL);
1244 return (iconv_t)(-1);
1247 int iconv_close(iconv_t cd)
1254 size_t iconv(iconv_t cd, char **__restrict inbuf,
1255 size_t *__restrict inbytesleft,
1256 char **__restrict outbuf, size_t *__restrict outbytesleft)
1258 _UC_iconv_t *px = (_UC_iconv_t *) cd;
1263 assert(px != (_UC_iconv_t *)(-1));
1264 assert(sizeof(wchar_t) == 4);
1266 if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
1267 /* Note: For shift-state encodings we possibly need to output the
1268 * shift sequence to return to initial state! */
1269 if ((px->fromcodeset & 0xf0) == 0xe0) {
1271 px->tostate.mask = px->fromstate.mask = 0;
1272 px->fromcodeset = px->fromcodeset0;
1273 px->tobom = px->tobom0;
1274 px->frombom = px->frombom0;
1279 while (*inbytesleft) {
1280 if (!*outbytesleft) {
1287 if (px->fromcodeset >= IC_MULTIBYTE) {
1288 inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1289 if (*inbytesleft < inci) goto INVALID;
1290 wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1291 + ((unsigned char)((*inbuf)[1]));
1293 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1294 + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1295 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1297 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1298 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1299 && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1302 if (*inbytesleft < 4) goto INVALID;
1303 wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1304 + ((unsigned char)((*inbuf)[3]));
1305 if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1306 if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1309 inci = 4; /* Change inci here in case skipping illegals. */
1310 wc = 0x10000UL + (wc << 10) + wc2;
1317 || (wc == ((inci == 4)
1318 ? (((wchar_t) 0xfffe0000UL))
1319 : ((wchar_t)(0xfffeUL))))
1321 if (wc != 0xfeffU) {
1322 px->fromcodeset ^= 1; /* toggle endianness */
1326 goto BOM_SKIP_OUTPUT;
1332 if (px->fromcodeset != IC_WCHAR_T) {
1333 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1334 ? 0x7fffffffUL : 0x10ffffUL)
1336 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1337 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1343 } else if (px->fromcodeset == IC_UTF_8) {
1344 const char *p = *inbuf;
1345 r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1346 if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1347 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1348 assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1349 if (r == (size_t)(-2)) {
1351 __set_errno(EINVAL);
1353 px->fromstate.mask = 0;
1356 if (px->skip_invalid_input) {
1357 px->skip_invalid_input = 2; /* flag for iconv utility */
1358 goto BOM_SKIP_OUTPUT;
1360 __set_errno(EILSEQ);
1362 return (size_t)(-1);
1364 #ifdef __UCLIBC_MJN3_ONLY__
1365 #warning optimize this
1367 if (p != NULL) { /* incomplet char case */
1370 p = *inbuf + 1; /* nul */
1373 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1374 if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1376 } else { /* some other 8-bit ascii-extension codeset */
1377 const codeset_8_bit_t *c8b
1378 = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1380 wc = __global_locale.tbl8c2wc[
1381 (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1382 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1397 if (px->tocodeset >= IC_MULTIBYTE) {
1398 inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1399 if (*outbytesleft < inci) goto TOO_BIG;
1400 if (px->tocodeset != IC_WCHAR_T) {
1401 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1402 ? 0x7fffffffUL : 0x10ffffUL)
1404 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1405 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1414 if (px->tocodeset & 1) wc = bswap_32(wc);
1416 if (((__uwchar_t)wc ) > 0xffffU) {
1417 if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1420 if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1421 wc2 = 0xdc00U + (wc & 0x3ff);
1422 wc = 0xd800U + ((wc >> 10) & 0x3ff);
1423 if (px->tocodeset & 1) {
1425 wc2 = bswap_16(wc2);
1428 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1430 (*outbuf)[0] = (char)((unsigned char)(wc));
1431 (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1433 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1434 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1436 } else if (px->tocodeset == IC_UTF_8) {
1437 const wchar_t *pw = &wc;
1439 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1440 if (r != (size_t)(-1)) {
1441 #ifdef __UCLIBC_MJN3_ONLY__
1442 #warning what happens for a nul?
1456 } else if (((__uwchar_t)(wc)) < 0x80) {
1460 if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1461 const codeset_8_bit_t *c8b
1462 = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1464 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1465 u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1466 + ((wc >> Cwc2c_TT_SHIFT)
1467 & ((1 << Cwc2c_TI_SHIFT)-1))];
1468 wc = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
1469 + (u << Cwc2c_TT_SHIFT)
1470 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1480 *outbytesleft -= inco;
1483 *inbytesleft -= inci;
1489 /**********************************************************************/
1500 extern const unsigned char codesets[];
1508 static void error_msg(const char *fmt, ...)
1509 __attribute__ ((noreturn, format (printf, 1, 2)));
1511 static void error_msg(const char *fmt, ...)
1516 fprintf(stderr, "%s: ", progname);
1518 vfprintf(stderr, fmt, arg);
1525 int main(int argc, char **argv)
1528 FILE *ofile = stdout;
1531 static const char opt_chars[] = "tfocsl";
1533 const char *opts[sizeof(opt_chars)]; /* last is infile name */
1539 size_t ni, no, r, pos;
1543 for (s = opt_chars ; *s ; s++) {
1544 opts[ s - opt_chars ] = NULL;
1550 if ((*p != '-') || (*++p == 0)) {
1554 if ((s = strchr(opt_chars,*p)) == NULL) {
1556 s = basename(progname);
1558 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1559 " or\n%s -l\n", s, s);
1560 return EXIT_FAILURE;
1562 if ((s - opt_chars) < 3) {
1563 if ((--argc == 0) || opts[s - opt_chars]) {
1566 opts[s - opt_chars] = *++argv;
1568 opts[s - opt_chars] = p;
1573 if (opts[5]) { /* -l */
1574 fprintf(stderr, "Recognized codesets:\n");
1575 for (s = codesets ; *s ; s += *s) {
1576 fprintf(stderr," %s\n", s+2);
1580 fprintf(stderr," %s\n", CODESET_LIST+ (unsigned char)(*s));
1583 return EXIT_SUCCESS;
1590 if (!opts[0] || !opts[1]) {
1593 if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1594 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1596 if (opts[3]) { /* -c */
1597 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1600 if ((s = opts[2]) != NULL) {
1601 if (!(ofile = fopen(s, "w"))) {
1602 error_msg( "couldn't open %s for writing\n", s);
1608 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1609 ifile = stdin; /* we don't check for duplicates */
1610 } else if (!(ifile = fopen(*argv, "r"))) {
1611 error_msg( "couldn't open %s for reading\n", *argv);
1614 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1620 if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1621 if ((errno != EINVAL) && (errno != E2BIG)) {
1622 error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1625 if ((r = OBUF - no) > 0) {
1626 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1627 error_msg( "write error\n");
1630 if (ni) { /* still bytes in buffer! */
1631 memmove(ibuf, pi, ni);
1635 if (ferror(ifile)) {
1636 error_msg( "read error\n");
1641 if (ifile != stdin) {
1645 } while (--argc > 0);
1650 error_msg( "incomplete sequence\n");
1653 return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1654 ? EXIT_SUCCESS : EXIT_FAILURE;
1658 /**********************************************************************/