2 /* Copyright (C) 2002, 2003, 2004 Manuel Novoa III
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 /* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION!
21 * Besides uClibc, I'm using this code in my libc for elks, which is
22 * a 16-bit environment with a fairly limited compiler. It would make
23 * things much easier for me if this file isn't modified unnecessarily.
24 * In particular, please put any new or replacement functions somewhere
25 * else, and modify the makefile to use your version instead.
28 * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */
31 /* May 23, 2002 Initial Notes:
33 * I'm still tweaking this stuff, but it passes the tests I've thrown
34 * at it, and Erik needs it for the gcc port. The glibc extension
35 * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36 * in the glibc source. I also need to fix the behavior of
37 * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
39 * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40 * file on my platform (x86) show about 5-10% faster conversion speed than
41 * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42 * individual mbrtowc()/wcrtomb() calls.
44 * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45 * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which
46 * needs to deal gracefully with whatever is sent to it. In that mode,
47 * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add
48 * an arg to force that behavior, so the interface will be changing.
50 * I need to fix the error checking for 16-bit wide chars. This isn't
51 * an issue for uClibc, but may be for ELKS. I'm currently not sure
52 * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
56 * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57 * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
59 * Enabled building of a C/POSIX-locale-only version, so full locale support
60 * no longer needs to be enabled.
64 * Fixed a bug in _wchar_wcsntoutf8s(). Don't store wcs position if dst is NULL.
65 * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66 * order to support %ls in printf. See comments below for details.
67 * Change behaviour of wc<->mb functions when in the C locale. Now they do
68 * a 1-1 map for the range 0x80-UCHAR_MAX. This is for backwards compatibility
69 * and consistency with the standard's requirement that a printf format string be
70 * a valid multibyte string beginning and ending in its initial shift state.
74 * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
78 * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79 * Added some size/speed optimizations and integrated it into my locale
80 * framework. Minimally tested at the moment, but the stub C-locale
81 * version (which most people would probably be using) should be fine.
85 * Revert the wc<->mb changes from earlier this month involving the C-locale.
86 * Add a couple of ugly hacks to support *wprintf.
87 * Add a mini iconv() and iconv implementation (requires locale support).
90 * Bug fix for mbrtowc.
93 * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
96 * Bug fix: Fix size check for remaining output space in iconv().
105 #include <inttypes.h>
111 #include <bits/uClibc_uwchar.h>
113 /**********************************************************************/
114 #ifdef __UCLIBC_HAS_LOCALE__
115 #ifdef __UCLIBC_MJN3_ONLY__
117 /* generates one warning */
118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
120 #endif /* __UCLIBC_MJN3_ONLY__ */
122 #define ENCODING (__UCLIBC_CURLOCALE->encoding)
124 #define Cc2wc_IDX_SHIFT __LOCALE_DATA_Cc2wc_IDX_SHIFT
125 #define Cc2wc_ROW_LEN __LOCALE_DATA_Cc2wc_ROW_LEN
126 #define Cwc2c_DOMAIN_MAX __LOCALE_DATA_Cwc2c_DOMAIN_MAX
127 #define Cwc2c_TI_SHIFT __LOCALE_DATA_Cwc2c_TI_SHIFT
128 #define Cwc2c_TT_SHIFT __LOCALE_DATA_Cwc2c_TT_SHIFT
129 #define Cwc2c_TI_LEN __LOCALE_DATA_Cwc2c_TI_LEN
131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
135 #else /* __UCLIBC_HAS_LOCALE__ */
137 #ifdef __UCLIBC_MJN3_ONLY__
140 #warning fix preprocessor logic testing locale settings
144 #define ENCODING (__ctype_encoding_7_bit)
145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
151 #undef L__wchar_utf8sntowcs
152 #undef L__wchar_wcsntoutf8s
154 #endif /* __UCLIBC_HAS_LOCALE__ */
155 /**********************************************************************/
157 #if WCHAR_MAX > 0xffffUL
158 #define UTF_8_MAX_LEN 6
160 #define UTF_8_MAX_LEN 3
165 /* Implementation-specific work functions. */
167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
168 const char **__restrict src, size_t n,
169 mbstate_t *ps, int allow_continuation) attribute_hidden;
171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
172 const wchar_t **__restrict src, size_t wn) attribute_hidden;
174 /**********************************************************************/
180 #ifdef __CTYPE_HAS_8_BIT_LOCALES
183 unsigned char buf[1];
187 *buf = (unsigned char) c;
188 mbstate.__mask = 0; /* Initialize the mbstate. */
189 if (mbrtowc(&wc, (char*) buf, 1, &mbstate) <= 1) {
195 #else /* !__CTYPE_HAS_8_BIT_LOCALES */
197 #ifdef __UCLIBC_HAS_LOCALE__
198 assert((ENCODING == __ctype_encoding_7_bit)
199 || (ENCODING == __ctype_encoding_utf8));
202 /* If we don't have 8-bit locale support, then this is trivial since
203 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
204 return (((unsigned int)c) < 0x80) ? c : WEOF;
206 #endif /* !__CTYPE_HAS_8_BIT_LOCALES */
208 libc_hidden_def(btowc)
211 /**********************************************************************/
214 /* Note: We completely ignore ps in all currently supported conversions. */
219 #ifdef __CTYPE_HAS_8_BIT_LOCALES
221 unsigned char buf[MB_LEN_MAX];
223 return (wcrtomb((char*) buf, c, NULL) == 1) ? *buf : EOF;
225 #else /* __CTYPE_HAS_8_BIT_LOCALES */
227 #ifdef __UCLIBC_HAS_LOCALE__
228 assert((ENCODING == __ctype_encoding_7_bit)
229 || (ENCODING == __ctype_encoding_utf8));
230 #endif /* __UCLIBC_HAS_LOCALE__ */
232 /* If we don't have 8-bit locale support, then this is trivial since
233 * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
235 /* TODO: need unsigned version of wint_t... */
236 /* return (((unsigned int)c) < 0x80) ? c : WEOF; */
237 return ((c >= 0) && (c < 0x80)) ? c : EOF;
239 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
243 /**********************************************************************/
/* Report whether a conversion state object is in the initial state.
 *
 * ps: conversion state to inspect; may be NULL.
 *
 * Returns nonzero when ps is NULL or when ps->__mask is zero, and 0
 * otherwise.  Only the __mask member is consulted. */
246 int mbsinit(const mbstate_t *ps)
248 return !ps || !ps->__mask;
250 libc_hidden_def(mbsinit)
253 /**********************************************************************/
/* Examine the next multibyte character in s without storing it.
 *
 * This is a thin wrapper: it calls mbrtowc() with a NULL destination and
 * returns whatever mbrtowc() returns.
 *
 * s:  multibyte input, passed through to mbrtowc().
 * n:  maximum number of bytes to examine, passed through to mbrtowc().
 * ps: conversion state; when NULL, the function-private static state
 *     declared below is used instead. */
257 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
259 static mbstate_t mbstate; /* Rely on bss 0-init. */
261 return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
263 libc_hidden_def(mbrlen)
266 /**********************************************************************/
270 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
271 size_t n, mbstate_t *__restrict ps)
273 static mbstate_t mbstate; /* Rely on bss 0-init. */
277 char empty_string[1]; /* Avoid static to be fPIC friendly. */
284 pwc = (wchar_t *) s; /* NULL */
285 empty_string[0] = 0; /* Init the empty string when necessary. */
288 } else if (*s == '\0') {
291 /* According to the ISO C 89 standard this is the expected behaviour. */
294 /* TODO: change error code? */
296 return (ps->__mask && (ps->__wc == 0xffffU))
297 ? ((size_t) -1) : ((size_t) -2);
305 #ifdef __CTYPE_HAS_UTF_8_LOCALES
306 /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
307 if (ENCODING == __ctype_encoding_utf8) {
311 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
312 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
316 #ifdef __UCLIBC_MJN3_ONLY__
317 #warning TODO: This adds a trailing nul!
318 #endif /* __UCLIBC_MJN3_ONLY__ */
320 r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
322 if (((ssize_t) r) >= 0) {
329 libc_hidden_def(mbrtowc)
332 /**********************************************************************/
336 /* Note: We completely ignore ps in all currently supported conversions. */
337 /* TODO: Check for valid state anyway? */
339 size_t wcrtomb(register char *__restrict s, wchar_t wc,
340 mbstate_t *__restrict ps)
342 #ifdef __UCLIBC_MJN3_ONLY__
343 #warning TODO: Should wcsnrtombs nul-terminate unconditionally? Check glibc.
344 #endif /* __UCLIBC_MJN3_ONLY__ */
348 char buf[MB_LEN_MAX];
358 r = wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
359 return (r != 0) ? r : 1;
361 libc_hidden_def(wcrtomb)
364 /**********************************************************************/
/* Convert a multibyte string to a wide-character string.
 *
 * This is a thin wrapper around mbsnrtowcs(): the source byte limit is
 * given as SIZE_MAX, and the result of mbsnrtowcs() is returned unchanged.
 *
 * dst: destination wide-character buffer, passed through.
 * src: address of the source pointer, passed through.
 * len: destination capacity in wide characters, passed through.
 * ps:  conversion state; when NULL, the function-private static state
 *      declared below is used instead. */
368 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
369 size_t len, mbstate_t *__restrict ps)
371 static mbstate_t mbstate; /* Rely on bss 0-init. */
373 return mbsnrtowcs(dst, src, SIZE_MAX, len,
374 ((ps != NULL) ? ps : &mbstate));
376 libc_hidden_def(mbsrtowcs)
379 /**********************************************************************/
382 /* Note: We completely ignore ps in all currently supported conversions.
384 * TODO: Check for valid state anyway? */
/* Convert a wide-character string to a multibyte string.
 *
 * This is a thin wrapper around wcsnrtombs(): the source wide-character
 * limit is given as SIZE_MAX, and the result of wcsnrtombs() is returned
 * unchanged.
 *
 * dst: destination byte buffer, passed through.
 * src: address of the source pointer, passed through.
 * len: destination capacity in bytes, passed through.
 * ps:  conversion state, passed through as-is (no static fallback is
 *      substituted here when it is NULL). */
387 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
388 size_t len, mbstate_t *__restrict ps)
390 return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
392 libc_hidden_def(wcsrtombs)
395 /**********************************************************************/
396 #ifdef L__wchar_utf8sntowcs
398 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
399 * UTF-8-test.txt stress test.
401 /* #define DECODER */
409 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
410 const char **__restrict src, size_t n,
411 mbstate_t *ps, int allow_continuation)
413 register const char *s;
426 /* NOTE: The following is an AWFUL HACK! In order to support %s in
427 * wprintf, we need to be able to compute the number of wchars needed
428 * for the mbs conversion, not to exceed the precision specified.
429 * But if dst is NULL, the return value is the length assuming a
430 * sufficiently sized buffer. So, we allow passing of (wchar_t *) ps
431 * as pwc in order to flag that we really want the length, subject
432 * to the restricted buffer size and no partial conversions.
433 * See mbsnrtowcs() as well. */
434 if (!pwc || (pwc == ((wchar_t *)ps))) {
442 /* This is really here only to support the glibc extension function
443 * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
444 * check on the validity of the mbstate. */
449 if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
451 wc = (__uwchar_t) ps->__wc;
457 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
458 /* TODO: change error code here and below? */
465 return (size_t) -1; /* We're in an error state. */
474 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
476 #ifdef __UCLIBC_MJN3_ONLY__
477 #warning TODO: Fix range for 16 bit wchar_t case.
479 if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
480 (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
491 return (size_t) -1; /* Illegal start byte! */
497 if ((*s & 0xc0) != 0x80) {
502 wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */
507 if ((wc & mask) == 0) { /* Character completed. */
508 if ((mask >>= 5) == 0x40) {
511 /* Check for invalid sequences (longer than necessary)
512 * and invalid chars. */
513 if ( (wc < mask) /* Sequence not minimal length. */
515 #if UTF_8_MAX_LEN == 3
516 #error broken since mask can overflow!!
517 /* For plane 0, these are the only defined values.*/
520 /* Note that we don't need to worry about exceeding */
521 /* 31 bits as that is the most that UTF-8 provides. */
522 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
524 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
532 /* Character potentially valid but incomplete. */
533 if (!allow_continuation) {
537 /* NOTE: The following can fail if you allow and then disallow
539 #if UTF_8_MAX_LEN == 3
540 #error broken since mask can overflow!!
542 /* Need to back up... */
545 } while ((mask >>= 5) >= 0x40);
548 ps->__mask = (wchar_t) mask;
549 ps->__wc = (wchar_t) wc;
560 while (wc && --count);
568 /* ps->__wc is irrelevant here. */
578 /**********************************************************************/
579 #ifdef L__wchar_wcsntoutf8s
581 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
582 const wchar_t **__restrict src, size_t wn)
587 const __uwchar_t *swc;
589 char buf[MB_LEN_MAX];
593 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
594 * printf, we need to be able to compute the number of bytes needed
595 * for the mbs conversion, not to exceed the precision specified.
596 * But if dst is NULL, the return value is the length assuming a
597 * sufficiently sized buffer. So, we allow passing of (char *) src
598 * as dst in order to flag that we really want the length, subject
599 * to the restricted buffer size and no partial conversions.
600 * See wcsnrtombs() as well. */
601 if (!s || (s == ((char *) src))) {
610 swc = (const __uwchar_t *) *src;
623 #if UTF_8_MAX_LEN == 3
624 /* For plane 0, these are the only defined values.*/
625 /* Note that we don't need to worry about exceeding */
626 /* 31 bits as that is the most that UTF-8 provides. */
629 /* UTF_8_MAX_LEN == 6 */
631 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
633 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
639 #if UTF_8_MAX_LEN != 3
640 if (wc > 0x7fffffffUL) { /* Value too large. */
653 if ((len = p - s) > t) { /* Not enough space. */
660 *--p = (wc & 0x3f) | 0x80;
664 } else if (wc == 0) { /* End of string. */
678 *src = (const wchar_t *) swc;
686 /**********************************************************************/
689 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
691 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
692 size_t NMC, size_t len, mbstate_t *__restrict ps)
694 static mbstate_t mbstate; /* Rely on bss 0-init. */
704 #ifdef __CTYPE_HAS_UTF_8_LOCALES
705 if (ENCODING == __ctype_encoding_utf8) {
707 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
708 != (size_t) -2) ? r : 0;
712 /* NOTE: The following is an AWFUL HACK! In order to support %s in
713 * wprintf, we need to be able to compute the number of wchars needed
714 * for the mbs conversion, not to exceed the precision specified.
715 * But if dst is NULL, the return value is the length assuming a
716 * sufficiently sized buffer. So, we allow passing of ((wchar_t *)ps)
717 * as dst in order to flag that we really want the length, subject
718 * to the restricted buffer size and no partial conversions.
719 * See _wchar_utf8sntowcs() as well. */
720 if (!dst || (dst == ((wchar_t *)ps))) {
728 /* Since all the following encodings are single-byte encodings... */
736 #ifdef __CTYPE_HAS_8_BIT_LOCALES
737 if (ENCODING == __ctype_encoding_8_bit) {
740 if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */
742 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
743 (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
744 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
764 #ifdef __UCLIBC_HAS_LOCALE__
765 assert(ENCODING == __ctype_encoding_7_bit);
769 if ((*dst = (unsigned char) *s) == 0) {
774 #ifdef __CTYPE_HAS_8_BIT_LOCALES
789 libc_hidden_def(mbsnrtowcs)
792 /**********************************************************************/
795 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
797 /* Note: We completely ignore ps in all currently supported conversions.
798 * TODO: Check for valid state anyway? */
800 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
801 size_t NWC, size_t len, mbstate_t *__restrict ps)
806 char buf[MB_LEN_MAX];
808 #ifdef __CTYPE_HAS_UTF_8_LOCALES
809 if (ENCODING == __ctype_encoding_utf8) {
810 return _wchar_wcsntoutf8s(dst, len, src, NWC);
812 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
815 /* NOTE: The following is an AWFUL HACK! In order to support %ls in
816 * printf, we need to be able to compute the number of bytes needed
817 * for the mbs conversion, not to exceed the precision specified.
818 * But if dst is NULL, the return value is the length assuming a
819 * sufficiently sized buffer. So, we allow passing of (char *) src
820 * as dst in order to flag that we really want the length, subject
821 * to the restricted buffer size and no partial conversions.
822 * See _wchar_wcsntoutf8s() as well. */
823 if (!dst || (dst == ((char *) src))) {
831 /* Since all the following encodings are single-byte encodings... */
837 s = (const __uwchar_t *) *src;
839 #ifdef __CTYPE_HAS_8_BIT_LOCALES
840 if (ENCODING == __ctype_encoding_8_bit) {
844 if ((wc = *s) <= 0x7f) {
845 if (!(*dst = (unsigned char) wc)) {
851 if (wc <= Cwc2c_DOMAIN_MAX) {
852 u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
854 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
855 + ((wc >> Cwc2c_TT_SHIFT)
856 & ((1 << Cwc2c_TI_SHIFT)-1))];
857 u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
858 + (u << Cwc2c_TT_SHIFT)
859 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
862 #ifdef __WCHAR_REPLACEMENT_CHAR
863 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
864 #else /* __WCHAR_REPLACEMENT_CHAR */
868 *dst = (unsigned char) u;
869 #endif /* __WCHAR_REPLACEMENT_CHAR */
876 *src = (const wchar_t *) s;
880 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
882 #ifdef __UCLIBC_HAS_LOCALE__
883 assert(ENCODING == __ctype_encoding_7_bit);
888 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
894 if ((*dst = (unsigned char) *s) == 0) {
903 *src = (const wchar_t *) s;
907 libc_hidden_def(wcsnrtombs)
910 /**********************************************************************/
914 #ifdef __UCLIBC_MJN3_ONLY__
915 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
916 #warning TODO: Update wcwidth to match latest by Kuhn.
919 #if defined(__UCLIBC_HAS_LOCALE__) && \
920 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
922 static const unsigned char new_idx[] = {
923 0, 5, 5, 6, 10, 15, 28, 39,
924 48, 48, 71, 94, 113, 128, 139, 154,
925 175, 186, 188, 188, 188, 188, 188, 188,
926 203, 208, 208, 208, 208, 208, 208, 208,
927 208, 219, 219, 219, 222, 222, 222, 222,
928 222, 222, 222, 222, 222, 222, 222, 224,
929 224, 231, 231, 231, 231, 231, 231, 231,
930 231, 231, 231, 231, 231, 231, 231, 231,
931 231, 231, 231, 231, 231, 231, 231, 231,
932 231, 231, 231, 231, 231, 231, 231, 231,
933 231, 231, 231, 231, 231, 231, 231, 231,
934 231, 231, 231, 231, 231, 231, 231, 231,
935 231, 231, 231, 231, 231, 231, 231, 231,
936 231, 231, 231, 231, 231, 231, 231, 231,
937 231, 231, 231, 231, 231, 231, 231, 231,
938 231, 231, 231, 231, 231, 231, 231, 231,
939 231, 231, 231, 231, 231, 231, 231, 231,
940 231, 231, 231, 231, 231, 231, 231, 231,
941 231, 231, 231, 231, 231, 231, 231, 231,
942 231, 231, 231, 231, 231, 231, 231, 231,
943 231, 231, 231, 231, 231, 233, 233, 233,
944 233, 233, 233, 233, 234, 234, 234, 234,
945 234, 234, 234, 234, 234, 234, 234, 234,
946 234, 234, 234, 234, 234, 234, 234, 234,
947 234, 234, 234, 234, 234, 234, 234, 234,
948 234, 234, 234, 234, 234, 234, 234, 234,
949 234, 234, 234, 234, 234, 234, 234, 234,
950 236, 236, 236, 236, 236, 236, 236, 236,
951 236, 236, 236, 236, 236, 236, 236, 236,
952 236, 236, 236, 236, 236, 236, 236, 236,
953 236, 236, 236, 236, 236, 236, 236, 236,
954 236, 237, 237, 238, 241, 241, 242, 249,
958 static const unsigned char new_tbl[] = {
959 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
960 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
961 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
962 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
963 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
964 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
965 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
966 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
967 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
968 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
969 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
970 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
971 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
972 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
973 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
974 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
975 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
976 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
977 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
978 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
979 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
980 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
981 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
982 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
983 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
984 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
985 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
986 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
987 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
988 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
989 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
990 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
993 static const signed char new_wtbl[] = {
994 0, -1, 1, -1, 1, 1, 0, 1,
995 0, 1, 1, 0, 1, 0, 1, 1,
996 0, 1, 0, 1, 0, 1, 0, 1,
997 0, 1, 0, 1, 1, 0, 1, 0,
998 1, 0, 1, 0, 1, 0, 1, 1,
999 0, 1, 0, 1, 0, 1, 0, 1,
1000 1, 0, 1, 0, 1, 0, 1, 0,
1001 1, 0, 1, 0, 1, 0, 1, 0,
1002 1, 0, 1, 0, 1, 0, 1, 1,
1003 0, 1, 0, 1, 0, 1, 0, 1,
1004 0, 1, 0, 1, 0, 1, 0, 1,
1005 0, 1, 0, 1, 0, 1, 1, 0,
1006 1, 0, 1, 0, 1, 0, 1, 0,
1007 1, 0, 1, 0, 1, 0, 1, 0,
1008 1, 1, 0, 1, 0, 1, 0, 1,
1009 0, 1, 0, 1, 0, 1, 0, 1,
1010 1, 0, 1, 0, 1, 0, 1, 0,
1011 1, 0, 1, 1, 0, 1, 0, 1,
1012 0, 1, 0, 1, 0, 1, 0, 1,
1013 0, 1, 1, 0, 1, 0, 1, 0,
1014 1, 0, 1, 0, 1, 0, 1, 0,
1015 1, 0, 1, 0, 1, 0, 1, 1,
1016 0, 1, 0, 1, 0, 1, 0, 1,
1017 0, 1, 2, 0, 1, 0, 1, 0,
1018 1, 0, 1, 0, 1, 0, 1, 0,
1019 1, 0, 1, 1, 0, 1, 0, 1,
1020 1, 0, 1, 0, 1, 0, 1, 0,
1021 1, 0, 1, 1, 2, 1, 1, 2,
1022 2, 0, 2, 1, 2, 0, 2, 2,
1023 1, 1, 2, 1, 1, 2, 1, 0,
1024 1, 1, 0, 1, 0, 1, 2, 1,
1025 0, 2, 1, 2, 1, 0, 1,
1029 int wcswidth(const wchar_t *pwcs, size_t n)
1035 if (ENCODING == __ctype_encoding_7_bit) {
1038 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1039 if (pwcs[i] != (pwcs[i] & 0x7f)) {
1044 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1045 else if (ENCODING == __ctype_encoding_8_bit) {
1048 mbstate.__mask = 0; /* Initialize the mbstate. */
1049 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1053 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1054 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1055 /* For stricter handling of allowed unicode values... see comments above. */
1056 else if (ENCODING == __ctype_encoding_utf8) {
1059 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1060 if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1061 || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1067 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1069 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1071 /* If we're here, wc != 0. */
1072 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1078 if (((unsigned int) wc) <= 0xffff) {
1083 while ((m = (l+h) >> 1) != l) {
1084 if (b >= new_tbl[m]) {
1086 } else { /* wc < tbl[m] */
1090 count += new_wtbl[l]; /* none should be -1. */
1094 /* Redo this to minimize average number of compares?*/
1095 if (wc >= 0x1d167) {
1096 if (wc <= 0x1d1ad) {
1102 || (wc >= 0x1d1aa))))))
1106 } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1108 } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1109 ++count; /* need 2.. add one here */
1111 #if (WCHAR_MAX > 0x7fffffffL)
1112 else if (wc > 0x7fffffffL) {
1115 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1124 #else /* __UCLIBC_HAS_LOCALE__ */
1126 int wcswidth(const wchar_t *pwcs, size_t n)
1132 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1133 if (pwcs[i] != (pwcs[i] & 0x7f)) {
1138 for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1140 /* If we're here, wc != 0. */
1141 if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1154 #endif /* __UCLIBC_HAS_LOCALE__ */
1156 libc_hidden_def(wcswidth)
1159 /**********************************************************************/
/* Return the display width of a single wide character.
 *
 * Implemented by calling wcswidth() on a one-element sequence consisting
 * of wc, so the result is exactly what wcswidth(&wc, 1) yields. */
1163 int wcwidth(wchar_t wc)
1165 return wcswidth(&wc, 1);
1169 /**********************************************************************/
1174 mbstate_t fromstate;
1182 int skip_invalid_input; /* To support iconv -c option. */
1185 /* For the multibyte
1186 * bit 0 means swap endian
1187 * bit 1 means 2 byte
1188 * bit 2 means 4 byte
1192 #if defined L_iconv && defined _LIBC
1193 /* Used externally only by iconv utility */
1194 extern const unsigned char __iconv_codesets[];
1195 libc_hidden_proto(__iconv_codesets)
1198 #if defined L_iconv || defined L_iconv_main
1199 const unsigned char __iconv_codesets[] =
1200 "\x0a\xe0""WCHAR_T\x00" /* superset of UCS-4 but platform-endian */
1201 #if __BYTE_ORDER == __BIG_ENDIAN
1202 "\x08\xec""UCS-4\x00" /* always BE */
1203 "\x0a\xec""UCS-4BE\x00"
1204 "\x0a\xed""UCS-4LE\x00"
1205 "\x09\xe4""UTF-32\x00" /* platform endian with BOM */
1206 "\x0b\xe4""UTF-32BE\x00"
1207 "\x0b\xe5""UTF-32LE\x00"
1208 "\x08\xe2""UCS-2\x00" /* always BE */
1209 "\x0a\xe2""UCS-2BE\x00"
1210 "\x0a\xe3""UCS-2LE\x00"
1211 "\x09\xea""UTF-16\x00" /* platform endian with BOM */
1212 "\x0b\xea""UTF-16BE\x00"
1213 "\x0b\xeb""UTF-16LE\x00"
1214 #elif __BYTE_ORDER == __LITTLE_ENDIAN
1215 "\x08\xed""UCS-4\x00" /* always BE */
1216 "\x0a\xed""UCS-4BE\x00"
1217 "\x0a\xec""UCS-4LE\x00"
1218 "\x09\xf4""UTF-32\x00" /* platform endian with BOM */
1219 "\x0b\xe5""UTF-32BE\x00"
1220 "\x0b\xe4""UTF-32LE\x00"
1221 "\x08\xe3""UCS-2\x00" /* always BE */
1222 "\x0a\xe3""UCS-2BE\x00"
1223 "\x0a\xe2""UCS-2LE\x00"
1224 "\x09\xfa""UTF-16\x00" /* platform endian with BOM */
1225 "\x0b\xeb""UTF-16BE\x00"
1226 "\x0b\xea""UTF-16LE\x00"
1228 "\x08\x02""UTF-8\x00"
1229 "\x0b\x01""US-ASCII\x00"
1230 "\x07\x01""ASCII"; /* Must be last! (special case to save a nul) */
1232 #if defined L_iconv && defined _LIBC
1233 libc_hidden_data_def(__iconv_codesets)
1242 #include <byteswap.h>
1244 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1245 #error unsupported endianness for iconv
1248 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1249 #error currently iconv requires 8 bit locales
1251 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1252 #error currently iconv requires UTF-8 locales
1258 IC_MULTIBYTE = 0xe0,
1259 #if __BYTE_ORDER == __BIG_ENDIAN
1275 static int find_codeset(const char *name)
1277 const unsigned char *s;
1280 for (s = __iconv_codesets; *s; s += *s) {
1281 if (!strcasecmp((char*) (s + 2), name)) {
1286 /* The following is ripped from find_locale in locale.c. */
1288 /* TODO: maybe CODESET_LIST + *s ??? */
1289 /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1291 s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1293 ++codeset; /* Increment codeset first. */
1294 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1299 return 0; /* No matching codeset! */
1302 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1304 register _UC_iconv_t *px;
1305 int tocodeset, fromcodeset;
1307 if (((tocodeset = find_codeset(tocode)) != 0)
1308 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1309 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1310 px->tocodeset = tocodeset;
1311 px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0;
1312 px->fromcodeset0 = px->fromcodeset = fromcodeset;
1313 px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0;
1314 px->skip_invalid_input = px->tostate.__mask
1315 = px->fromstate.__mask = 0;
1316 return (iconv_t) px;
1319 __set_errno(EINVAL);
1321 return (iconv_t)(-1);
1324 int weak_function iconv_close(iconv_t cd)
1331 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1332 size_t *__restrict inbytesleft,
1333 char **__restrict outbuf,
1334 size_t *__restrict outbytesleft)
1336 _UC_iconv_t *px = (_UC_iconv_t *) cd;
1341 assert(px != (_UC_iconv_t *)(-1));
1342 assert(sizeof(wchar_t) == 4);
1344 if (!inbuf || !*inbuf) { /* Need to reinitialze conversion state. */
1345 /* Note: For shift-state encodings we possibly need to output the
1346 * shift sequence to return to initial state! */
1347 if ((px->fromcodeset & 0xf0) == 0xe0) {
1349 px->tostate.__mask = px->fromstate.__mask = 0;
1350 px->fromcodeset = px->fromcodeset0;
1351 px->tobom = px->tobom0;
1352 px->frombom = px->frombom0;
1357 while (*inbytesleft) {
1358 if (!*outbytesleft) {
1365 if (px->fromcodeset >= IC_MULTIBYTE) {
1366 inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1367 if (*inbytesleft < inci) goto INVALID;
1368 wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1369 + ((unsigned char)((*inbuf)[1]));
1371 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1372 + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1373 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1375 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1376 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1377 && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1380 if (*inbytesleft < 4) goto INVALID;
1381 wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1382 + ((unsigned char)((*inbuf)[3]));
1383 if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1384 if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1387 inci = 4; /* Change inci here in case skipping illegals. */
1388 wc = 0x10000UL + (wc << 10) + wc2;
1395 || (wc == ((inci == 4)
1396 ? (((wchar_t) 0xfffe0000UL))
1397 : ((wchar_t)(0xfffeUL))))
1399 if (wc != 0xfeffU) {
1400 px->fromcodeset ^= 1; /* toggle endianness */
1404 goto BOM_SKIP_OUTPUT;
1410 if (px->fromcodeset != IC_WCHAR_T) {
1411 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1412 ? 0x7fffffffUL : 0x10ffffUL)
1414 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1415 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1421 } else if (px->fromcodeset == IC_UTF_8) {
1422 const char *p = *inbuf;
1423 r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1424 if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1425 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1426 assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1427 if (r == (size_t)(-2)) {
1429 __set_errno(EINVAL);
1431 px->fromstate.__mask = 0;
1434 if (px->skip_invalid_input) {
1435 px->skip_invalid_input = 2; /* flag for iconv utility */
1436 goto BOM_SKIP_OUTPUT;
1438 __set_errno(EILSEQ);
1440 return (size_t)(-1);
1442 #ifdef __UCLIBC_MJN3_ONLY__
1443 #warning TODO: optimize this.
1445 if (p != NULL) { /* incomplete char case */
1448 p = *inbuf + 1; /* nul */
1451 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1452 if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1454 } else { /* some other 8-bit ascii-extension codeset */
1455 const __codeset_8_bit_t *c8b
1456 = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1458 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1459 (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1460 << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1475 if (px->tocodeset >= IC_MULTIBYTE) {
1476 inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1477 if (*outbytesleft < inco) goto TOO_BIG;
1478 if (px->tocodeset != IC_WCHAR_T) {
1479 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1480 ? 0x7fffffffUL : 0x10ffffUL)
1482 || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1483 || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1492 if (px->tocodeset & 1) wc = bswap_32(wc);
1494 if (((__uwchar_t)wc ) > 0xffffU) {
1495 if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1498 if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1499 wc2 = 0xdc00U + (wc & 0x3ff);
1500 wc = 0xd800U + ((wc >> 10) & 0x3ff);
1501 if (px->tocodeset & 1) {
1503 wc2 = bswap_16(wc2);
1506 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1508 (*outbuf)[0] = (char)((unsigned char)(wc));
1509 (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1511 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1512 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1514 } else if (px->tocodeset == IC_UTF_8) {
1515 const wchar_t *pw = &wc;
1517 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1518 if (r != (size_t)(-1)) {
1519 #ifdef __UCLIBC_MJN3_ONLY__
1520 #warning TODO: What happens for a nul?
1534 } else if (((__uwchar_t)(wc)) < 0x80) {
1538 if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1539 const __codeset_8_bit_t *c8b
1540 = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1542 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1543 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1544 + ((wc >> Cwc2c_TT_SHIFT)
1545 & ((1 << Cwc2c_TI_SHIFT)-1))];
1546 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1547 + (u << Cwc2c_TT_SHIFT)
1548 + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1558 *outbytesleft -= inco;
1561 *inbytesleft -= inci;