OSDN Git Service

"make utils" now successfully makes utils for target
[uclinux-h8/uClibc.git] / libc / misc / wchar / wchar.c
1
2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
3  *
4  *  This library is free software; you can redistribute it and/or
5  *  modify it under the terms of the GNU Library General Public
6  *  License as published by the Free Software Foundation; either
7  *  version 2 of the License, or (at your option) any later version.
8  *
9  *  This library is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  *  Library General Public License for more details.
13  *
14  *  You should have received a copy of the GNU Library General Public
15  *  License along with this library; if not, write to the Free
16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17  */
18
19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
20  *
21  *  Besides uClibc, I'm using this code in my libc for elks, which is
22  *  a 16-bit environment with a fairly limited compiler.  It would make
23  *  things much easier for me if this file isn't modified unnecessarily.
24  *  In particular, please put any new or replacement functions somewhere
25  *  else, and modify the makefile to use your version instead.
26  *  Thanks.  Manuel
27  *
28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
29
30
31 /* May 23, 2002     Initial Notes:
32  *
33  * I'm still tweaking this stuff, but it passes the tests I've thrown
34  * at it, and Erik needs it for the gcc port.  The glibc extension
35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
36  * in the glibc source.  I also need to fix the behavior of
37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
38  *
39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
40  * file on my platform (x86) show about 5-10% faster conversion speed than
41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
42  * individual mbrtowc()/wcrtomb() calls.
43  *
44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
46  * needs to deal gracefully with whatever is sent to it.  In that mode,
47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
48  * an arg to force that behavior, so the interface will be changing.
49  *
50  * I need to fix the error checking for 16-bit wide chars.  This isn't
51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
53  *
54  * July 1, 2002
55  *
56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
58  *    locales.
59  * Enabled building of a C/POSIX-locale-only version, so full locale support
60  *    no longer needs to be enabled.
61  *
62  * Nov 4, 2002
63  *
64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
66  *   order to support %ls in printf.  See comments below for details.
67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
69  *   and consistency with the standard's requirement that a printf format string be
70  *   a valid multibyte string beginning and ending in its initial shift state.
71  *
72  * Nov 5, 2002
73  *
74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
75  *
76  * Nov 7, 2002
77  *
78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
79  *   Added some size/speed optimizations and integrated it into my locale
80  *   framework.  Minimally tested at the moment, but the stub C-locale
81  *   version (which most people would probably be using) should be fine.
82  *
83  * Nov 21, 2002
84  *
85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
86  * Add a couple of ugly hacks to support *wprintf.
87  * Add a mini iconv() and iconv implementation (requires locale support).
88  *
89  * Aug 1, 2003
90  * Bug fix for mbrtowc.
91  *
92  * Aug 18, 2003
93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
94  *
95  * Feb 11, 2004
96  * Bug fix: Fix size check for remaining output space in iconv().
97  *
98  * Manuel
99  */
100
101 #include <errno.h>
102 #include <stddef.h>
103 #include <limits.h>
104 #include <stdint.h>
105 #include <inttypes.h>
106 #include <stdlib.h>
107 #include <stdio.h>
108 #include <assert.h>
109 #include <locale.h>
110 #include <wchar.h>
111 #include <bits/uClibc_uwchar.h>
112
113 /**********************************************************************/
/* Locale configuration for this file.
 *
 * With __UCLIBC_HAS_LOCALE__ defined, ENCODING reads the encoding field of
 * the current locale and the Cc2wc_* / Cwc2c_* names are short aliases for
 * the corresponding __LOCALE_DATA_* constants.
 *
 * Without it, ENCODING is the constant __ctype_encoding_7_bit, defining
 * either 8-bit or UTF-8 locale support is a hard error, and the L__wchar_*
 * selectors are undefined so the UTF-8 helper functions guarded by them
 * are not compiled. */
114 #ifdef __UCLIBC_HAS_LOCALE__
115 #ifdef __UCLIBC_MJN3_ONLY__
116 #ifdef L_iswspace
117 /* Tied to L_iswspace so the warning below is generated only once. */
118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
119 #endif
120 #endif /* __UCLIBC_MJN3_ONLY__ */
121
/* Encoding identifier of the current locale. */
122 #define ENCODING                (__UCLIBC_CURLOCALE->encoding)
123
/* Short aliases for the locale-data table geometry constants. */
124 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
125 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
126 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
127 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
128 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
129 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
130
131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
133 #endif
134
135 #else  /* __UCLIBC_HAS_LOCALE__ */
136
137 #ifdef __UCLIBC_MJN3_ONLY__
138 #ifdef L_btowc
139 /* emit only once */
140 #warning fix preprocessor logic testing locale settings
141 #endif
142 #endif
143
/* No locale support: the encoding is fixed at 7-bit. */
144 #define ENCODING (__ctype_encoding_7_bit)
145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
147 #endif
148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
150 #endif
/* Suppress compilation of the UTF-8 helper functions in this configuration. */
151 #undef L__wchar_utf8sntowcs
152 #undef L__wchar_wcsntoutf8s
153
154 #endif /* __UCLIBC_HAS_LOCALE__ */
155 /**********************************************************************/
156
/* UTF_8_MAX_LEN is 6 when wchar_t can hold values above 0xffff and 3
 * otherwise.  The UTF-8 converters below test it to select their range
 * checks. */
157 #if WCHAR_MAX > 0xffffUL
158 #define UTF_8_MAX_LEN 6
159 #else
160 #define UTF_8_MAX_LEN 3
161 #endif
162
/* When KUHN is defined, the UTF-8 converters additionally reject the values
 * 0xfffe/0xffff and the range 0xd800-0xdfff. */
163 #define KUHN 1
164
165 /* Implementation-specific work functions. */
166
/* UTF-8 -> wide conversion; defined under L__wchar_utf8sntowcs. */
167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
168                                         const char **__restrict src, size_t n,
169                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
170
/* Wide -> UTF-8 conversion; defined under L__wchar_wcsntoutf8s. */
171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
172                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
173
174 /**********************************************************************/
175 #ifdef L_btowc
176
177 /* libc_hidden_proto(mbrtowc) */
178
179 /* libc_hidden_proto(btowc) */
/* Convert a single byte to its wide-character equivalent.
 *
 * Returns WEOF when c is EOF or when the byte does not, on its own, form a
 * valid character in the current encoding. */
wint_t btowc(int c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	mbstate_t state;
	unsigned char byte[1];
	wchar_t result;

	if (c == EOF) {
		return WEOF;
	}

	byte[0] = (unsigned char) c;
	state.__mask = 0;		/* Start from the initial conversion state. */

	/* Anything other than a 0 or 1 byte conversion is a failure. */
	if (mbrtowc(&result, (char *) byte, 1, &state) > 1) {
		return WEOF;
	}
	return result;

#else  /* !__CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif

	/* Without 8-bit locales only 0-0x7f can be a one-byte character in
	 * the C/POSIX and UTF-8 locales.  The unsigned compare also rejects
	 * every negative value, EOF included. */
	if (((unsigned int) c) >= 0x80) {
		return WEOF;
	}
	return c;

#endif /* !__CTYPE_HAS_8_BIT_LOCALES */
}
210 libc_hidden_def(btowc)
211
212 #endif
213 /**********************************************************************/
214 #ifdef L_wctob
215
216 /* Note: We completely ignore ps in all currently supported conversions. */
217
218 /* libc_hidden_proto(wcrtomb) */
219
/* Convert a wide character to its single-byte representation.
 *
 * Returns EOF when the wide character has no one-byte representation in the
 * current encoding. */
int wctob(wint_t c)
{
#ifdef __CTYPE_HAS_8_BIT_LOCALES

	unsigned char out[MB_LEN_MAX];

	/* Only a conversion that produced exactly one byte qualifies. */
	if (wcrtomb((char *) out, c, NULL) != 1) {
		return EOF;
	}
	return out[0];

#else  /*  __CTYPE_HAS_8_BIT_LOCALES */

#ifdef __UCLIBC_HAS_LOCALE__
	assert((ENCODING == __ctype_encoding_7_bit)
		   || (ENCODING == __ctype_encoding_utf8));
#endif /* __UCLIBC_HAS_LOCALE__ */

	/* Without 8-bit locales only 0-0x7f maps to a single byte in the
	 * C/POSIX and UTF-8 locales. */

	/* TODO: need unsigned version of wint_t... */
	if ((c >= 0) && (c < 0x80)) {
		return c;
	}
	return EOF;

#endif /*  __CTYPE_HAS_8_BIT_LOCALES */
}
244
245 #endif
246 /**********************************************************************/
247 #ifdef L_mbsinit
248
249 /* libc_hidden_proto(mbsinit) */
250 int mbsinit(const mbstate_t *ps)
251 {
252         return !ps || !ps->__mask;
253 }
254 libc_hidden_def(mbsinit)
255
256 #endif
257 /**********************************************************************/
258 #ifdef L_mbrlen
259
260 /* libc_hidden_proto(mbrtowc) */
261
262 /* libc_hidden_proto(mbrlen) */
/* Length in bytes of the next multibyte character in s (at most n bytes).
 *
 * A thin wrapper around mbrtowc() that discards the converted character.
 * When ps is NULL, a state object private to this function is used. */
size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Zero-initialized static storage. */
	mbstate_t *state = ps;

	if (state == NULL) {
		state = &private_state;
	}
	return mbrtowc(NULL, s, n, state);
}
269 libc_hidden_def(mbrlen)
270
271 #endif
272 /**********************************************************************/
273 #ifdef L_mbrtowc
274
275 /* libc_hidden_proto(mbsnrtowcs) */
276
277 /* libc_hidden_proto(mbrtowc) */
/* Convert the next multibyte character in s (at most n bytes) to a wide
 * character.
 *
 * pwc  Where to store the result; may be NULL to discard it.
 * s    Multibyte input.  If NULL, the call behaves as if s were "" with
 *      n == 1 and pwc == NULL.
 * n    Maximum number of bytes of s to examine.
 * ps   Conversion state; if NULL, a state private to this function is used.
 *
 * Returns the number of bytes consumed, 0 if the character converted is the
 * null wide character, or the (size_t) -1 / (size_t) -2 style result handed
 * back by the underlying conversion routine.  As before, n == 0 yields 0.
 *
 * Changes from the previous version:
 *  - an input that begins with '\0' now stores the null wide character in
 *    *pwc (when pwc is non-NULL), as ISO C requires, instead of leaving
 *    *pwc untouched;
 *  - n == 0 is tested before *s is read, so no byte is examined when the
 *    caller says none is available. */
size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
			   size_t n, mbstate_t *__restrict ps)
{
	static mbstate_t mbstate;	/* Rely on bss 0-init. */
	wchar_t wcbuf[1];
	const char *p;
	size_t r;
	char empty_string[1];		/* Avoid static to be fPIC friendly. */

	if (!ps) {
		ps = &mbstate;
	}

	if (!s) {
		pwc = NULL;
		empty_string[0] = 0;	/* Init the empty string when necessary. */
		s = empty_string;
		n = 1;
	} else if (!n) {
		/* TODO: change error code?  ISO C specifies (size_t) -2 for an
		 * incomplete character; 0 is kept here for compatibility. */
		return 0;
	} else if (*s == '\0') {
		/* The null character converts to the null wide character. */
		if (pwc) {
			*pwc = L'\0';
		}
		return 0;
	}

	p = s;

#ifdef __CTYPE_HAS_UTF_8_LOCALES
	/* Need to do this here since mbsrtowcs doesn't allow incompletes. */
	if (ENCODING == __ctype_encoding_utf8) {
		if (!pwc) {
			pwc = wcbuf;
		}
		r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
		return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
	}
#endif

#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: This adds a trailing nul!
#endif /* __UCLIBC_MJN3_ONLY__ */

	r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);

	/* Only hand the character back when the conversion did not fail. */
	if (((ssize_t) r) >= 0) {
		if (pwc) {
			*pwc = *wcbuf;
		}
	}
	return (size_t) r;
}
335 libc_hidden_def(mbrtowc)
336
337 #endif
338 /**********************************************************************/
339 #ifdef L_wcrtomb
340
341 /* libc_hidden_proto(wcsnrtombs) */
342
343 /* Note: We completely ignore ps in all currently supported conversions. */
344 /* TODO: Check for valid state anyway? */
345
346 /* libc_hidden_proto(wcrtomb) */
/* Convert one wide character to its multibyte representation.
 *
 * If s is NULL the call converts the null wide character into a scratch
 * buffer.  ps is passed straight through to wcsnrtombs().
 *
 * Returns the value reported by wcsnrtombs(), except that a result of 0
 * is reported as 1. */
size_t wcrtomb(register char *__restrict s, wchar_t wc,
			   mbstate_t *__restrict ps)
{
#ifdef __UCLIBC_MJN3_ONLY__
#warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
#endif /* __UCLIBC_MJN3_ONLY__ */
	char scratch[MB_LEN_MAX];
	wchar_t single[1];
	const wchar_t *cursor = single;
	size_t written;

	if (s == NULL) {
		s = scratch;
		wc = 0;
	}
	single[0] = wc;

	written = wcsnrtombs(s, &cursor, 1, MB_LEN_MAX, ps);
	if (written == 0) {
		return 1;
	}
	return written;
}
369 libc_hidden_def(wcrtomb)
370
371 #endif
372 /**********************************************************************/
373 #ifdef L_mbsrtowcs
374
375 /* libc_hidden_proto(mbsnrtowcs) */
376
377 /* libc_hidden_proto(mbsrtowcs) */
/* Convert a multibyte string to a wide-character string.
 *
 * Forwards to mbsnrtowcs() with SIZE_MAX as the input byte limit.  When ps
 * is NULL, a state object private to this function is used. */
size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	static mbstate_t private_state;	/* Zero-initialized static storage. */
	mbstate_t *state = ps;

	if (state == NULL) {
		state = &private_state;
	}
	return mbsnrtowcs(dst, src, SIZE_MAX, len, state);
}
386 libc_hidden_def(mbsrtowcs)
387
388 #endif
389 /**********************************************************************/
390 #ifdef L_wcsrtombs
391
392 /* Note: We completely ignore ps in all currently supported conversions.
393
394  * TODO: Check for valid state anyway? */
395
396 /* libc_hidden_proto(wcsnrtombs) */
397
398 /* libc_hidden_proto(wcsrtombs) */
/* Convert a wide-character string to a multibyte string.
 *
 * Forwards to wcsnrtombs() with SIZE_MAX as the input wide-character
 * limit; ps is passed through unchanged. */
size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
				 size_t len, mbstate_t *__restrict ps)
{
	size_t converted;

	converted = wcsnrtombs(dst, src, SIZE_MAX, len, ps);
	return converted;
}
404 libc_hidden_def(wcsrtombs)
405
406 #endif
407 /**********************************************************************/
408 #ifdef L__wchar_utf8sntowcs
409
410 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
411  * UTF-8-test.txt stress test.
412  */
413 /*  #define DECODER */
414
415 #ifdef DECODER
416 #ifndef KUHN
417 #define KUHN
418 #endif
419 #endif
420
/* Convert UTF-8 bytes to wide characters.
 *
 * pwc   Destination for up to wn wide characters.  If NULL, wn is treated
 *       as SIZE_MAX and nothing is kept (each result lands in a one-element
 *       scratch buffer).  If equal to (wchar_t *) ps, nothing is kept
 *       either but the caller's wn limit is honoured.
 * wn    Maximum number of wide characters to produce; 0 returns 0 at once.
 * src   In: start of the input.  Out (only when results are being stored):
 *       position after the last byte consumed, or NULL if a nul character
 *       ended the conversion (non-DECODER build).
 * n     Maximum number of input bytes to read.
 * ps    Conversion state.  A nonzero __mask means a partial character is
 *       pending in __wc; __wc == 0xffff marks a previous error.
 * allow_continuation
 *       Nonzero: an incomplete trailing sequence is saved in *ps and
 *       (size_t) -2 is returned.  Zero: the incomplete sequence is not
 *       consumed.
 *
 * Returns the number of wide characters produced (a terminating nul is not
 * counted), (size_t) -2 as described above, or (size_t) -1 with errno set
 * to EILSEQ for invalid input (non-DECODER build).  A DECODER build
 * substitutes 0xfffd for invalid input instead of failing. */
421 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
422                                                   const char **__restrict src, size_t n,
423                                                   mbstate_t *ps, int allow_continuation)
424 {
425         register const char *s;
426         __uwchar_t mask;
427         __uwchar_t wc;
428         wchar_t wcbuf[1];
429         size_t count;
430         int incr;
431
432         s = *src;
433
434         assert(s != NULL);
435         assert(ps != NULL);
436
437         incr = 1;
438         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
439          * wprintf, we need to be able to compute the number of wchars needed
440          * for the mbs conversion, not to exceed the precision specified.
441          * But if dst is NULL, the return value is the length assuming a
442          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
443          * as pwc in order to flag that we really want the length, subject
444          * to the restricted buffer size and no partial conversions.
445          * See mbsnrtowcs() as well. */
446         if (!pwc || (pwc == ((wchar_t *)ps))) {
447                 if (!pwc) {
448                         wn = SIZE_MAX;
449                 }
                /* Count-only mode: write every result to wcbuf[0] and never advance. */
450                 pwc = wcbuf;
451                 incr = 0;
452         }
453
454         /* This is really here only to support the glibc extension function
455          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
456          * check on the validity of the mbstate. */
457         if (!(count = wn)) {
458                 return 0;
459         }
460
461         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
462 #ifdef DECODER
463                 wc = (__uwchar_t) ps->__wc;
464                 if (n) {
465                         goto CONTINUE;
466                 }
467                 goto DONE;
468 #else
                /* 0xffff in __wc is the marker stored by the BAD path below. */
469                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
470                         /* TODO: change error code here and below? */
471                         if (n) {
472                                 goto CONTINUE;
473                         }
474                         goto DONE;
475                 }
476                 __set_errno(EILSEQ);
477                 return (size_t) -1;             /* We're in an error state. */
478 #endif
479         }
480
        /* One iteration per wide character produced. */
481         do {
482                 if (!n) {
483                         goto DONE;
484                 }
485                 --n;
486                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
487                         mask = 0x40;
488 #ifdef __UCLIBC_MJN3_ONLY__
489 #warning TODO: Fix range for 16 bit wchar_t case.
490 #endif
                        /* Accept lead bytes 0xc2..0xfd: inside 0xc0..0xfd but not 0xc0 or 0xc1. */
491                         if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
492                         (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
493                                 goto START;
494                         }
495                 BAD:
496 #ifdef DECODER
497                         wc = 0xfffdU;
498                         goto COMPLETE;
499 #else
                        /* Record the error in the state so later calls fail too. */
500                         ps->__mask = mask;
501                         ps->__wc = 0xffffU;
502                         __set_errno(EILSEQ);
503                         return (size_t) -1;     /* Illegal start byte! */
504 #endif
505
506                 CONTINUE:
507                         while (n) {
508                                 --n;
                                /* Every trailing byte must have the form 10xxxxxx. */
509                                 if ((*s & 0xc0) != 0x80) {
510                                         goto BAD;
511                                 }
512                                 mask <<= 5;
513                                 wc <<= 6;
514                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
515                                 ++s;
516                         START:
517                                 wc &= ~(mask << 1);
518
519                                 if ((wc & mask) == 0) { /* Character completed. */
520                                         if ((mask >>= 5) == 0x40) {
521                                                 mask += mask;
522                                         }
523                                         /* Check for invalid sequences (longer than necessary)
524                                          * and invalid chars.  */
525                                         if ( (wc < mask) /* Sequence not minimal length. */
526 #ifdef KUHN
527 #if UTF_8_MAX_LEN == 3
528 #error broken since mask can overflow!!
529                                                  /* For plane 0, these are the only defined values.*/
530                                                  || (wc > 0xfffdU)
531 #else
532                                                  /* Note that we don't need to worry about exceeding */
533                                                  /* 31 bits as that is the most that UTF-8 provides. */
534                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
535 #endif
536                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
537 #endif /* KUHN */
538                                                  ) {
539                                                 goto BAD;
540                                         }
541                                         goto COMPLETE;
542                                 }
543                         }
544                         /* Character potentially valid but incomplete. */
545                         if (!allow_continuation) {
                                /* Wide characters were already produced by this call. */
546                                 if (count != wn) {
547                                         return 0;
548                                 }
549                                 /* NOTE: The following can fail if you allow and then disallow
550                                  * continuation!!! */
551 #if UTF_8_MAX_LEN == 3
552 #error broken since mask can overflow!!
553 #endif
554                                 /* Need to back up... */
555                                 do {
556                                         --s;
557                                 } while ((mask >>= 5) >= 0x40);
558                                 goto DONE;
559                         }
                        /* Save the partial character so a later call can resume it. */
560                         ps->__mask = (wchar_t) mask;
561                         ps->__wc = (wchar_t) wc;
562                         *src = s;
563                         return (size_t) -2;
564                 }
565         COMPLETE:
566                 *pwc = wc;
567                 pwc += incr;
568         }
569 #ifdef DECODER
570         while (--count);
571 #else
        /* Stop at a nul; the short-circuit leaves count undecremented for it. */
572         while (wc && --count);
573
574         if (!wc) {
575                 s = NULL;
576         }
577 #endif
578
579  DONE:
580         /* ps->__wc is irrelevant here. */
581         ps->__mask = 0;
        /* In count-only mode pwc still equals wcbuf, so *src is left untouched. */
582         if (pwc != wcbuf) {
583                 *src = s;
584         }
585
586         return wn - count;
587 }
588
589 #endif
590 /**********************************************************************/
591 #ifdef L__wchar_wcsntoutf8s
592
/* Convert wide characters to UTF-8.
 *
 * s    Destination for up to n bytes.  If NULL, n is treated as SIZE_MAX
 *      and nothing is kept (each character is encoded into a scratch
 *      buffer).  If equal to (char *) src, nothing is kept either but the
 *      caller's n limit is honoured.
 * n    Maximum number of bytes to produce.
 * src  In: start of the wide-character input.  Out (only when output is
 *      being stored): position after the last character converted, or NULL
 *      if a nul wide character ended the conversion.
 * wn   Maximum number of wide characters to read.
 *
 * A character whose full encoding does not fit in the remaining space is
 * not converted at all.  A terminating nul is written to the current output
 * position but not counted.
 *
 * Returns the number of bytes produced, or (size_t) -1 with errno set to
 * EILSEQ when a wide character fails the range checks below. */
593 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
594                                                   const wchar_t **__restrict src, size_t wn)
595 {
596         register char *p;
597         size_t len, t;
598         __uwchar_t wc;
599         const __uwchar_t *swc;
600         int store;
601         char buf[MB_LEN_MAX];
602         char m;
603
604         store = 1;
605         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
606          * printf, we need to be able to compute the number of bytes needed
607          * for the mbs conversion, not to exceed the precision specified.
608          * But if dst is NULL, the return value is the length assuming a
609          * sufficiently sized buffer.  So, we allow passing of (char *) src
610          * as dst in order to flag that we really want the length, subject
611          * to the restricted buffer size and no partial conversions.
612          * See wcsnrtombs() as well. */
613         if (!s || (s == ((char *) src))) {
614                 if (!s) {
615                         n = SIZE_MAX;
616                 }
            /* Count-only mode: encode into buf and never advance s. */
617             s = buf;
618                 store = 0;
619         }
620
        /* t tracks the output space still available. */
621         t = n;
622         swc = (const __uwchar_t *) *src;
623
624         assert(swc != NULL);
625
626         while (wn && t) {
627                 wc = *swc;
628
                /* Assume a single-byte character; adjusted below if wc >= 0x80. */
629                 *s = wc;
630                 len = 1;
631
632                 if (wc >= 0x80) {
633 #ifdef KUHN
634                         if (
635 #if UTF_8_MAX_LEN == 3
636                                 /* For plane 0, these are the only defined values.*/
637                                 /* Note that we don't need to worry about exceeding */
638                                 /* 31 bits as that is the most that UTF-8 provides. */
639                                 (wc > 0xfffdU)
640 #else
641                                 /* UTF_8_MAX_LEN == 6 */
642                                 (wc > 0x7fffffffUL)
643                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
644 #endif
645                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
646                                 ) {
647                                 __set_errno(EILSEQ);
648                                 return (size_t) -1;
649                         }
650 #else  /* KUHN */
651 #if UTF_8_MAX_LEN != 3
652                         if (wc > 0x7fffffffUL) { /* Value too large. */
653                                 __set_errno(EILSEQ);
654                                 return (size_t) -1;
655                         }
656 #endif
657 #endif /* KUHN */
658
                        /* Work out the encoded length: p is advanced once per
                         * output byte, then wc is reloaded because the shifts
                         * destroyed it. */
659                         wc >>= 1;
660                         p = s;
661                         do {
662                                 ++p;
663                         } while (wc >>= 5);
664                         wc = *swc;
665                         if ((len = p - s) > t) { /* Not enough space. */
666                                 break;
667                         }
668
                        /* Fill the bytes from last to first, six payload bits
                         * each.  m gains one more high bit per byte written and
                         * is OR-ed, shifted left once, into the first byte as
                         * the length prefix. */
669                         m = 0x80;
670                         while( p>s ) {
671                                 m = (m >> 1) | 0x80;
672                                 *--p = (wc & 0x3f) | 0x80;
673                                 wc >>= 6;
674                         }
675                         *s |= (m << 1);
676                 } else if (wc == 0) {   /* End of string. */
677                         swc = NULL;
678                         break;
679                 }
680
681                 ++swc;
682                 --wn;
683                 t -= len;
684                 if (store) {
685                         s += len;
686                 }
687         }
688
        /* The input position is reported only when output was being stored. */
689         if (store) {
690                 *src = (const wchar_t *) swc;
691         }
692
693         return n - t;
694 }
695
696
697 #endif
698 /**********************************************************************/
699 #ifdef L_mbsnrtowcs
700
701 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
702
703 /* libc_hidden_proto(mbsnrtowcs) */
/* Convert at most NMC bytes of a multibyte string to at most len wide
 * characters.
 *
 * dst  Destination.  If NULL, len is treated as SIZE_MAX and nothing is
 *      kept.  If equal to (wchar_t *) ps, nothing is kept either but the
 *      caller's len limit is honoured.
 * src  In: start of the input.  Out (only when results are being stored):
 *      position after the last byte converted, or NULL if a nul byte ended
 *      the conversion.
 * NMC  Maximum number of input bytes to read.
 * len  Maximum number of wide characters to produce.
 * ps   Conversion state; if NULL, a state private to this function is used.
 *
 * For the UTF-8 encoding the work is delegated to _wchar_utf8sntowcs(),
 * with its (size_t) -2 result reported as 0.  The remaining encodings are
 * single-byte: 8-bit locales translate bytes >= 0x80 through the locale
 * tables, and the 7-bit encoding rejects them.
 *
 * Returns the number of wide characters produced (a terminating nul is not
 * counted), or (size_t) -1 with errno set to EILSEQ for an invalid byte. */
704 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
705                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
706 {
707         static mbstate_t mbstate;       /* Rely on bss 0-init. */
708         wchar_t wcbuf[1];
709         const char *s;
710         size_t count;
711         int incr;
712
713         if (!ps) {
714                 ps = &mbstate;
715         }
716
717 #ifdef __CTYPE_HAS_UTF_8_LOCALES
718         if (ENCODING == __ctype_encoding_utf8) {
719                 size_t r;
                /* Report an incomplete trailing sequence ((size_t) -2) as 0. */
720                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
721                                 != (size_t) -2) ? r : 0;
722         }
723 #endif
724         incr = 1;
725         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
726          * wprintf, we need to be able to compute the number of wchars needed
727          * for the mbs conversion, not to exceed the precision specified.
728          * But if dst is NULL, the return value is the length assuming a
729          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
730          * as dst in order to flag that we really want the length, subject
731          * to the restricted buffer size and no partial conversions.
732          * See _wchar_utf8sntowcs() as well. */
733         if (!dst || (dst == ((wchar_t *)ps))) {
734                 if (!dst) {
735                         len = SIZE_MAX;
736                 }
                /* Count-only mode: write every result to wcbuf[0] and never advance. */
737                 dst = wcbuf;
738                 incr = 0;
739         }
740
741         /* Since all the following encodings are single-byte encodings... */
742         if (len > NMC) {
743                 len = NMC;
744         }
745
746         count = len;
747         s = *src;
748
749 #ifdef __CTYPE_HAS_8_BIT_LOCALES
750         if (ENCODING == __ctype_encoding_8_bit) {
751                 wchar_t wc;
752                 while (count) {
753                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
                                /* Two-level lookup: the index table picks a row, the low bits the column. */
754                                 wc -= 0x80;
755                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
756                                                   (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
757                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
                                /* A zero table entry is treated as an invalid byte. */
758                                 if (!wc) {
759                                         goto BAD;
760                                 }
761                         }
                        /* Store the character; a nul ends the conversion uncounted. */
762                         if (!(*dst = wc)) {
763                                 s = NULL;
764                                 break;
765                         }
766                         dst += incr;
767                         ++s;
768                         --count;
769                 }
                /* In count-only mode dst still equals wcbuf, so *src is left untouched. */
770                 if (dst != wcbuf) {
771                         *src = s;
772                 }
773                 return len - count;
774         }
775 #endif
776
777 #ifdef __UCLIBC_HAS_LOCALE__
778         assert(ENCODING == __ctype_encoding_7_bit);
779 #endif
780
781         while (count) {
                /* Store the byte first; a nul ends the conversion uncounted. */
782                 if ((*dst = (unsigned char) *s) == 0) {
783                         s = NULL;
784                         break;
785                 }
786                 if (*dst >= 0x80) {
787 #ifdef __CTYPE_HAS_8_BIT_LOCALES
                /* Also the jump target for the 8-bit loop above. */
788                 BAD:
789 #endif
790                         __set_errno(EILSEQ);
791                         return (size_t) -1;
792                 }
793                 ++s;
794                 dst += incr;
795                 --count;
796         }
        /* In count-only mode dst still equals wcbuf, so *src is left untouched. */
797         if (dst != wcbuf) {
798                 *src = s;
799         }
800         return len - count;
801 }
802 libc_hidden_def(mbsnrtowcs)
803
804 #endif
805 /**********************************************************************/
806 #ifdef L_wcsnrtombs
807
808 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
809
810 /* Note: We completely ignore ps in all currently supported conversions.
811  * TODO: Check for valid state anyway? */
812
813 /* libc_hidden_proto(wcsnrtombs) */
814 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
815                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
816 {
817         const __uwchar_t *s;
818         size_t count;
819         int incr;
820         char buf[MB_LEN_MAX];
821
822 #ifdef __CTYPE_HAS_UTF_8_LOCALES
823         if (ENCODING == __ctype_encoding_utf8) {
824                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
825         }
826 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
827
828         incr = 1;
829         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
830          * printf, we need to be able to compute the number of bytes needed
831          * for the mbs conversion, not to exceed the precision specified.
832          * But if dst is NULL, the return value is the length assuming a
833          * sufficiently sized buffer.  So, we allow passing of (char *) src
834          * as dst in order to flag that we really want the length, subject
835          * to the restricted buffer size and no partial conversions.
836          * See _wchar_wcsntoutf8s() as well. */
837         if (!dst || (dst == ((char *) src))) {
838                 if (!dst) {
839                         len = SIZE_MAX;
840                 }
841                 dst = buf;
842                 incr = 0;
843         }
844
845         /* Since all the following encodings are single-byte encodings... */
846         if (len > NWC) {
847                 len = NWC;
848         }
849
850         count = len;
851         s = (const __uwchar_t *) *src;
852
853 #ifdef __CTYPE_HAS_8_BIT_LOCALES
854         if (ENCODING == __ctype_encoding_8_bit) {
855                 __uwchar_t wc;
856                 __uwchar_t u;
857                 while (count) {
858                         if ((wc = *s) <= 0x7f) {
859                                 if (!(*dst = (unsigned char) wc)) {
860                                         s = NULL;
861                                         break;
862                                 }
863                         } else {
864                                 u = 0;
865                                 if (wc <= Cwc2c_DOMAIN_MAX) {
866                                         u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
867                                                                                                                 + Cwc2c_TT_SHIFT)];
868                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
869                                                                         + ((wc >> Cwc2c_TT_SHIFT)
870                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
871                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
872                                                                         + (u << Cwc2c_TT_SHIFT)
873                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
874                                 }
875
876 #ifdef __WCHAR_REPLACEMENT_CHAR
877                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
878 #else  /* __WCHAR_REPLACEMENT_CHAR */
879                                 if (!u) {
880                                         goto BAD;
881                                 }
882                                 *dst = (unsigned char) u;
883 #endif /* __WCHAR_REPLACEMENT_CHAR */
884                         }
885                         ++s;
886                         dst += incr;
887                         --count;
888                 }
889                 if (dst != buf) {
890                         *src = (const wchar_t *) s;
891                 }
892                 return len - count;
893         }
894 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
895
896 #ifdef __UCLIBC_HAS_LOCALE__
897         assert(ENCODING == __ctype_encoding_7_bit);
898 #endif
899
900         while (count) {
901                 if (*s >= 0x80) {
902 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
903                 BAD:
904 #endif
905                         __set_errno(EILSEQ);
906                         return (size_t) -1;
907                 }
908                 if ((*dst = (unsigned char) *s) == 0) {
909                         s = NULL;
910                         break;
911                 }
912                 ++s;
913                 dst += incr;
914                 --count;
915         }
916         if (dst != buf) {
917                 *src = (const wchar_t *) s;
918         }
919         return len - count;
920 }
921 libc_hidden_def(wcsnrtombs)
922
923 #endif
924 /**********************************************************************/
925 #ifdef L_wcswidth
926
927 /* libc_hidden_proto(wcswidth) */
928
929 #ifdef __UCLIBC_MJN3_ONLY__
930 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
931 #warning TODO: Update wcwidth to match latest by Kuhn.
932 #endif
933
934 #if defined(__UCLIBC_HAS_LOCALE__) && \
935 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
936
/* Page index for the width tables used by wcswidth().
 *
 * Indexed by the high byte of a 16-bit code point (wc >> 8).  Entries [p]
 * and [p+1] are used as the initial lower/upper bounds of the binary search
 * over new_tbl[] for page p, hence the 257th entry.  Each row below covers
 * sixteen consecutive pages. */
static const unsigned char new_idx[] = {
	/* 0x00 */   0,   5,   5,   6,  10,  15,  28,  39,  48,  48,  71,  94, 113, 128, 139, 154,
	/* 0x10 */ 175, 186, 188, 188, 188, 188, 188, 188, 203, 208, 208, 208, 208, 208, 208, 208,
	/* 0x20 */ 208, 219, 219, 219, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 224,
	/* 0x30 */ 224, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x40 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x50 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x60 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x70 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x80 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0x90 */ 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231,
	/* 0xa0 */ 231, 231, 231, 231, 231, 233, 233, 233, 233, 233, 233, 233, 234, 234, 234, 234,
	/* 0xb0 */ 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234,
	/* 0xc0 */ 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234,
	/* 0xd0 */ 234, 234, 234, 234, 234, 234, 234, 234, 236, 236, 236, 236, 236, 236, 236, 236,
	/* 0xe0 */ 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236,
	/* 0xf0 */ 236, 236, 236, 236, 236, 236, 236, 236, 236, 237, 237, 238, 241, 241, 242, 249,
	/* end  */ 255,
};
972
/* Low-byte breakpoints for wcswidth().
 *
 * For a 16-bit code point, wcswidth() binary-searches the slice of this
 * table selected through new_idx[] for the last entry that is <= the low
 * byte (wc & 0xff); the matching new_wtbl[] entry is the width added.
 * The leading comment on each row is the index of its first element. */
static const unsigned char new_tbl[] = {
	/*   0 */ 0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50, 0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
	/*  16 */ 0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0, 0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
	/*  32 */ 0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00, 0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
	/*  48 */ 0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d, 0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
	/*  64 */ 0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00, 0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
	/*  80 */ 0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd, 0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
	/*  96 */ 0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d, 0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
	/* 112 */ 0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e, 0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
	/* 128 */ 0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2, 0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
	/* 144 */ 0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd, 0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
	/* 160 */ 0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86, 0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
	/* 176 */ 0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a, 0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
	/* 192 */ 0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6, 0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
	/* 208 */ 0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a, 0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
	/* 224 */ 0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00, 0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
	/* 240 */ 0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70, 0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
};
1007
/* Column widths paired one-to-one with the breakpoints in new_tbl[].
 *
 * wcswidth() adds new_wtbl[i] for the interval that starts at new_tbl[i].
 * The -1 entries sit in the slice for page 0, which wcswidth() never looks
 * up here because code points <= 0xff are handled before the table search.
 * The leading comment on each row is the index of its first element. */
static const signed char new_wtbl[] = {
	/*   0 */ 0, -1,  1, -1,  1,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,  1,
	/*  16 */ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,
	/*  32 */ 1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,  0,  1,  0,  1,
	/*  48 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,
	/*  64 */ 1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,  0,  1,  0,  1,
	/*  80 */ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,
	/*  96 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,
	/* 112 */ 1,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
	/* 128 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,
	/* 144 */ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,  0,
	/* 160 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,
	/* 176 */ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  0,  1,  0,  1,  0,
	/* 192 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  0,  1,
	/* 208 */ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  1,  2,
	/* 224 */ 2,  0,  2,  1,  2,  0,  2,  2,  1,  1,  2,  1,  1,  2,  1,  0,
	/* 240 */ 1,  1,  0,  1,  0,  1,  2,  1,  0,  2,  1,  2,  1,  0,  1,
};
1042
1043 /* libc_hidden_proto(wcsnrtombs) */
1044
1045 int wcswidth(const wchar_t *pwcs, size_t n)
1046 {
1047     int h, l, m, count;
1048     wchar_t wc;
1049     unsigned char b;
1050
1051         if (ENCODING == __ctype_encoding_7_bit) {
1052                 size_t i;
1053
1054                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1055                         if (pwcs[i] != (pwcs[i] & 0x7f)) {
1056                                 return -1;
1057                         }
1058                 }
1059         }
1060 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1061         else if (ENCODING == __ctype_encoding_8_bit) {
1062                 mbstate_t mbstate;
1063
1064                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1065                 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1066                         return -1;
1067                 }
1068         }
1069 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1070 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1071         /* For stricter handling of allowed unicode values... see comments above. */
1072         else if (ENCODING == __ctype_encoding_utf8) {
1073                 size_t i;
1074
1075                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1076                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1077                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1078                                 ) {
1079                                 return -1;
1080                         }
1081                 }
1082         }
1083 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1084
1085     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1086                 if (wc <= 0xff) {
1087                         /* If we're here, wc != 0. */
1088                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1089                                 return -1;
1090                         }
1091                         ++count;
1092                         continue;
1093                 }
1094                 if (((unsigned int) wc) <= 0xffff) {
1095                         b = wc & 0xff;
1096                         h = (wc >> 8);
1097                         l = new_idx[h];
1098                         h = new_idx[h+1];
1099                         while ((m = (l+h) >> 1) != l) {
1100                                 if (b >= new_tbl[m]) {
1101                                         l = m;
1102                                 } else {                /* wc < tbl[m] */
1103                                         h = m;
1104                                 }
1105                         }
1106                         count += new_wtbl[l]; /* none should be -1. */
1107                         continue;
1108                 }
1109
1110                 /* Redo this to minimize average number of compares?*/
1111                 if (wc >= 0x1d167) {
1112                         if (wc <= 0x1d1ad) {
1113                                 if ((wc <= 0x1d169
1114                                          || (wc >= 0x1d173
1115                                                  && (wc <= 0x1d182
1116                                                          || (wc >= 0x1d185
1117                                                                  && (wc <= 0x1d18b
1118                                                                          || (wc >= 0x1d1aa))))))
1119                                         ) {
1120                                         continue;
1121                                 }
1122                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1123                                 continue;
1124                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1125                                 ++count;                /* need 2.. add one here */
1126                         }
1127 #if (WCHAR_MAX > 0x7fffffffL)
1128                         else if (wc > 0x7fffffffL) {
1129                                 return -1;
1130                         }
1131 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1132                 }
1133
1134                 ++count;
1135     }
1136
1137     return count;
1138 }
1139
1140 #else  /*  __UCLIBC_HAS_LOCALE__ */
1141
/* Locale-less wcswidth(): only printable 7-bit ASCII is accepted.
 *
 * Examines at most n wide characters of pwcs, stopping early at a nul.
 * Returns the number of characters seen (each has width 1), or -1 if any
 * examined character is a control character (below 0x20 or 0x7f) or lies
 * outside the 7-bit range. */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	int width = 0;

	for ( ; n && *pwcs ; ++pwcs, --n) {
		/* Printable ASCII is exactly 0x20..0x7e. */
		if ((*pwcs < 0x20) || (*pwcs > 0x7e)) {
			return -1;
		}
		++width;
	}

	return width;
}
1169
1170 #endif /*  __UCLIBC_HAS_LOCALE__ */
1171
1172 libc_hidden_def(wcswidth)
1173
1174 #endif
1175 /**********************************************************************/
1176 #ifdef L_wcwidth
1177
1178 /* libc_hidden_proto(wcswidth) */
1179
/* Column width of a single wide character: wcswidth() applied to a
 * one-character string, so the result and the -1 error value are whatever
 * wcswidth() reports for it. */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
1184
1185 #endif
1186 /**********************************************************************/
1187
1188
/* Conversion descriptor behind an iconv_t handle.
 *
 * iconv_open() allocates and fills one of these; iconv() restores the
 * "...0" members into their working counterparts when asked to reset. */
typedef struct {
	/* Shift states; only their __mask member is touched in this file. */
	mbstate_t tostate, fromstate;
	/* Codeset ids as returned by find_codeset(). */
	int tocodeset, fromcodeset;
	/* Non-zero while a byte-order mark is still pending on that side. */
	int frombom, tobom;
	/* Initial values of fromcodeset/frombom/tobom, kept for resets. */
	int fromcodeset0, frombom0, tobom0;
	/* To support iconv -c option. */
	int skip_invalid_input;
} _UC_iconv_t;
1201
1202
1203
1204 #ifdef L_iconv
1205
1206 #include <iconv.h>
1207 #include <string.h>
1208 #include <endian.h>
1209 #include <byteswap.h>
1210
1211 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1212 #error unsupported endianness for iconv
1213 #endif
1214
1215 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1216 #error currently iconv requires 8 bit locales
1217 #endif
1218 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1219 #error currently iconv requires UTF-8 locales
1220 #endif
1221
1222
/* Codeset ids used by iconv().
 *
 * Ids at or above IC_MULTIBYTE denote fixed-width 2- or 4-byte encodings;
 * IC_WCHAR_T shares that value.  The four Unicode ids depend on the
 * platform byte order and differ only in bit 0 between the two builds. */
enum {
	IC_ASCII = 1,
	IC_UTF_8 = 2,
	IC_MULTIBYTE = 0xe0,
	IC_WCHAR_T = 0xe0,
#if __BYTE_ORDER == __BIG_ENDIAN
	IC_UCS_2 = 0xe2,
	IC_UTF_32 = 0xe4,
	IC_UTF_16 = 0xea,
	IC_UCS_4 = 0xec
#else
	IC_UCS_2 = 0xe3,
	IC_UTF_32 = 0xe5,
	IC_UTF_16 = 0xeb,
	IC_UCS_4 = 0xed
#endif
};
1240
/* For the multibyte codeset ids (those >= IC_MULTIBYTE):
 * bit 0 means swap endian
 * bit 1 means 2 byte
 * bit 2 means 4 byte
 * bit 4 (0x10) is the byte-order-mark flag tested by iconv_open()
 */
1247
1248 /* Used externally only by iconv utility */
1249 extern const unsigned char __iconv_codesets[];
1250 libc_hidden_proto(__iconv_codesets)
1251
/* Table of the built-in codeset names understood by find_codeset().
 *
 * Each record is: one byte holding the record length (the distance to the
 * next record), one byte holding the codeset id, then the nul-terminated
 * name.  The final record relies on the string literal's own terminating
 * nul, which also serves as the zero length byte that ends the table.
 *
 * NOTE(review): the little-endian table gives "UTF-32" and "UTF-16" ids
 * with the 0x10 bit set (0xf4 / 0xfa), which iconv_open() reads as the BOM
 * flag, while the big-endian table uses 0xe4 / 0xea for the same names even
 * though both are commented "with BOM" -- confirm whether that is intended. */
const unsigned char __iconv_codesets[] =
	"\x0a\xe0" "WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec" "UCS-4\x00"			/* always BE */
	"\x0a\xec" "UCS-4BE\x00"
	"\x0a\xed" "UCS-4LE\x00"
	"\x09\xe4" "UTF-32\x00"			/* platform endian with BOM */
	"\x0b\xe4" "UTF-32BE\x00"
	"\x0b\xe5" "UTF-32LE\x00"
	"\x08\xe2" "UCS-2\x00"			/* always BE */
	"\x0a\xe2" "UCS-2BE\x00"
	"\x0a\xe3" "UCS-2LE\x00"
	"\x09\xea" "UTF-16\x00"			/* platform endian with BOM */
	"\x0b\xea" "UTF-16BE\x00"
	"\x0b\xeb" "UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed" "UCS-4\x00"			/* always BE */
	"\x0a\xed" "UCS-4BE\x00"
	"\x0a\xec" "UCS-4LE\x00"
	"\x09\xf4" "UTF-32\x00"			/* platform endian with BOM */
	"\x0b\xe5" "UTF-32BE\x00"
	"\x0b\xe4" "UTF-32LE\x00"
	"\x08\xe3" "UCS-2\x00"			/* always BE */
	"\x0a\xe3" "UCS-2BE\x00"
	"\x0a\xe2" "UCS-2LE\x00"
	"\x09\xfa" "UTF-16\x00"			/* platform endian with BOM */
	"\x0b\xeb" "UTF-16BE\x00"
	"\x0b\xea" "UTF-16LE\x00"
#endif
	"\x08\x02" "UTF-8\x00"
	"\x0b\x01" "US-ASCII\x00"
	"\x07\x01" "ASCII";				/* Must be last! (special case to save a nul) */
1284 libc_hidden_data_def(__iconv_codesets)
1285
1286 /* Experimentally off - libc_hidden_proto(strcasecmp) */
1287
1288 static int find_codeset(const char *name)
1289 {
1290         const unsigned char *s;
1291         int codeset;
1292
1293         for (s = __iconv_codesets; *s; s += *s) {
1294                 if (!strcasecmp((char*) (s + 2), name)) {
1295                         return s[1];
1296                 }
1297         }
1298
1299         /* The following is ripped from find_locale in locale.c. */
1300
1301         /* TODO: maybe CODESET_LIST + *s ??? */
1302         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1303         codeset = 2;
1304         s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1305         do {
1306                 ++codeset;              /* Increment codeset first. */
1307                 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1308                         return codeset;
1309                 }
1310         } while (*++s);
1311
1312         return 0;                       /* No matching codeset! */
1313 }
1314
1315 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1316 {
1317         register _UC_iconv_t *px;
1318         int tocodeset, fromcodeset;
1319
1320         if (((tocodeset = find_codeset(tocode)) != 0)
1321                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1322                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1323                         px->tocodeset = tocodeset;
1324                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1325                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1326                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1327                         px->skip_invalid_input = px->tostate.__mask
1328                                 = px->fromstate.__mask = 0;
1329                         return (iconv_t) px;
1330                 }
1331         } else {
1332                 __set_errno(EINVAL);
1333         }
1334         return (iconv_t)(-1);
1335 }
1336
1337 int weak_function iconv_close(iconv_t cd)
1338 {
1339         free(cd);
1340
1341         return 0;
1342 }
1343
1344 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1345                                                    size_t *__restrict inbytesleft,
1346                                                    char **__restrict outbuf,
1347                                                    size_t *__restrict outbytesleft)
1348 {
1349         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1350         size_t nrcount, r;
1351         wchar_t wc, wc2;
1352         int inci, inco;
1353
1354         assert(px != (_UC_iconv_t *)(-1));
1355         assert(sizeof(wchar_t) == 4);
1356
1357         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1358                 /* Note: For shift-state encodings we possibly need to output the
1359                  * shift sequence to return to initial state! */
1360                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1361                 }
1362                 px->tostate.__mask = px->fromstate.__mask = 0;
1363                 px->fromcodeset = px->fromcodeset0;
1364                 px->tobom = px->tobom0;
1365                 px->frombom = px->frombom0;
1366                 return 0;
1367         }
1368
1369         nrcount = 0;
1370         while (*inbytesleft) {
1371                 if (!*outbytesleft) {
1372                 TOO_BIG:
1373                         __set_errno(E2BIG);
1374                         return (size_t) -1;
1375                 }
1376
1377                 inci = inco = 1;
1378                 if (px->fromcodeset >= IC_MULTIBYTE) {
1379                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1380                         if (*inbytesleft < inci) goto INVALID;
1381                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1382                                 + ((unsigned char)((*inbuf)[1]));
1383                         if (inci == 4) {
1384                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1385                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1386                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1387                         } else {
1388                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1389                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1390                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1391                                         ) {                     /* surrogate */
1392                                         wc =- 0xd800U;
1393                                         if (*inbytesleft < 4) goto INVALID;
1394                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1395                                                 + ((unsigned char)((*inbuf)[3]));
1396                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1397                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1398                                                 goto ILLEGAL;
1399                                         }
1400                                         inci = 4;       /* Change inci here in case skipping illegals. */
1401                                         wc = 0x10000UL + (wc << 10) + wc2;
1402                                 }
1403                         }
1404
1405                         if (px->frombom) {
1406                                 px->frombom = 0;
1407                                 if ((wc == 0xfeffU)
1408                                         || (wc == ((inci == 4)
1409                                                            ? (((wchar_t) 0xfffe0000UL))
1410                                                            : ((wchar_t)(0xfffeUL))))
1411                                         ) {
1412                                         if (wc != 0xfeffU) {
1413                                                 px->fromcodeset ^= 1; /* toggle endianness */
1414                                                 wc = 0xfeffU;
1415                                         }
1416                                         if (!px->frombom) {
1417                                                 goto BOM_SKIP_OUTPUT;
1418                                         }
1419                                         goto GOT_BOM;
1420                                 }
1421                         }
1422
1423                         if (px->fromcodeset != IC_WCHAR_T) {
1424                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1425                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1426 #ifdef KUHN
1427                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1428                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1429 #endif
1430                                         ) {
1431                                         goto ILLEGAL;
1432                                 }
1433                         }
1434                 } else if (px->fromcodeset == IC_UTF_8) {
1435                         const char *p = *inbuf;
1436                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1437                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1438                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1439                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1440                                         if (r == (size_t)(-2)) {
1441                                         INVALID:
1442                                                 __set_errno(EINVAL);
1443                                         } else {
1444                                                 px->fromstate.__mask = 0;
1445                                                 inci = 1;
1446                                         ILLEGAL:
1447                                                 if (px->skip_invalid_input) {
1448                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1449                                                         goto BOM_SKIP_OUTPUT;
1450                                                 }
1451                                                 __set_errno(EILSEQ);
1452                                         }
1453                                         return (size_t)(-1);
1454                                 }
1455 #ifdef __UCLIBC_MJN3_ONLY__
1456 #warning TODO: optimize this.
1457 #endif
1458                                 if (p != NULL) { /* incomplete char case */
1459                                         goto INVALID;
1460                                 }
1461                                 p = *inbuf + 1; /* nul */
1462                         }
1463                         inci = p - *inbuf;
1464                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1465                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1466                                 goto ILLEGAL;
1467                         } else {                        /* some other 8-bit ascii-extension codeset */
1468                                 const __codeset_8_bit_t *c8b
1469                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1470                                 wc -= 0x80;
1471                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1472                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1473                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1474                                 if (!wc) {
1475                                         goto ILLEGAL;
1476                                 }
1477                         }
1478                 }
1479
1480
1481                 if (px->tobom) {
1482                         inci = 0;
1483                         wc = 0xfeffU;
1484         GOT_BOM:
1485                         px->tobom = 0;
1486                 }
1487
1488                 if (px->tocodeset >= IC_MULTIBYTE) {
1489                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1490                         if (*outbytesleft < inco) goto TOO_BIG;
1491                         if (px->tocodeset != IC_WCHAR_T) {
1492                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1493                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1494 #ifdef KUHN
1495                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1496                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1497 #endif
1498                                         ) {
1499                                 REPLACE_32:
1500                                         wc = 0xfffd;
1501                                         ++nrcount;
1502                                 }
1503                         }
1504                         if (inco == 4) {
1505                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1506                         } else {
1507                                 if (((__uwchar_t)wc ) > 0xffffU) {
1508                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1509                                                 goto REPLACE_32;
1510                                         }
1511                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1512                                         wc2 = 0xdc00U + (wc & 0x3ff);
1513                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1514                                         if (px->tocodeset & 1) {
1515                                                 wc = bswap_16(wc);
1516                                                 wc2 = bswap_16(wc2);
1517                                         }
1518                                         wc += (wc2 << 16);
1519                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1520                         }
1521                         (*outbuf)[0] = (char)((unsigned char)(wc));
1522                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1523                         if (inco == 4) {
1524                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1525                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1526                         }
1527                 } else if (px->tocodeset == IC_UTF_8) {
1528                         const wchar_t *pw = &wc;
1529                         do {
1530                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1531                                 if (r != (size_t)(-1)) {
1532 #ifdef __UCLIBC_MJN3_ONLY__
1533 #warning TODO: What happens for a nul?
1534 #endif
1535                                         if (r == 0) {
1536                                                 if (wc != 0) {
1537                                                         goto TOO_BIG;
1538                                                 }
1539                                                 ++r;
1540                                         }
1541                                         break;
1542                                 }
1543                                 wc = 0xfffdU;
1544                                 ++nrcount;
1545                         } while (1);
1546                         inco = r;
1547                 } else if (((__uwchar_t)(wc)) < 0x80) {
1548                 CHAR_GOOD:
1549                                 **outbuf = wc;
1550                 } else {
1551                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1552                                 const __codeset_8_bit_t *c8b
1553                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1554                                 __uwchar_t u;
1555                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1556                                 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1557                                                  + ((wc >> Cwc2c_TT_SHIFT)
1558                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1559                                 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1560                                                  + (u << Cwc2c_TT_SHIFT)
1561                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1562                                 if (wc) {
1563                                         goto CHAR_GOOD;
1564                                 }
1565                         }
1566                         **outbuf = '?';
1567                         ++nrcount;
1568                 }
1569
1570                 *outbuf += inco;
1571                 *outbytesleft -= inco;
1572         BOM_SKIP_OUTPUT:
1573                 *inbuf += inci;
1574                 *inbytesleft -= inci;
1575         }
1576         return nrcount;
1577 }
1578
1579 #endif
1580 /**********************************************************************/
1581 #ifdef L_iconv_main
1582
1583 #include <string.h>
1584 #include <iconv.h>
1585 #include <stdarg.h>
1586 #include <libgen.h>
1587
1588 extern const unsigned char __iconv_codesets[];
1589
1590 #define IBUF BUFSIZ
1591 #define OBUF BUFSIZ
1592
1593 static char *progname;
1594 static int hide_errors;
1595
1596 static void error_msg(const char *fmt, ...)
1597          __attribute__ ((noreturn, format (printf, 1, 2)));
1598
1599 static void error_msg(const char *fmt, ...)
1600 {
1601         va_list arg;
1602
1603         if (!hide_errors) {
1604                 fprintf(stderr, "%s: ", progname);
1605                 va_start(arg, fmt);
1606                 vfprintf(stderr, fmt, arg);
1607                 va_end(arg);
1608         }
1609
1610         exit(EXIT_FAILURE);
1611 }
1612
1613 int main(int argc, char **argv)
1614 {
1615         FILE *ifile;
1616         FILE *ofile = stdout;
1617         const char *p;
1618         const char *s;
1619         static const char opt_chars[] = "tfocsl";
1620                                       /* 012345 */
1621         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1622         iconv_t ic;
1623         char ibuf[IBUF];
1624         char obuf[OBUF];
1625         char *pi;
1626         char *po;
1627         size_t ni, no, r, pos;
1628
1629         hide_errors = 0;
1630
1631         for (s = opt_chars ; *s ; s++) {
1632                 opts[ s - opt_chars ] = NULL;
1633         }
1634
1635         progname = *argv;
1636         while (--argc) {
1637                 p = *++argv;
1638                 if ((*p != '-') || (*++p == 0)) {
1639                         break;
1640                 }
1641                 do {
1642                         if ((s = strchr(opt_chars,*p)) == NULL) {
1643                         USAGE:
1644                                 s = basename(progname);
1645                                 fprintf(stderr,
1646                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1647                                                 "  or\n%s -l\n", s, s);
1648                                 return EXIT_FAILURE;
1649                         }
1650                         if ((s - opt_chars) < 3) {
1651                                 if ((--argc == 0) || opts[s - opt_chars]) {
1652                                         goto USAGE;
1653                                 }
1654                                 opts[s - opt_chars] = *++argv;
1655                         } else {
1656                                 opts[s - opt_chars] = p;
1657                         }
1658                 } while (*++p);
1659         }
1660
1661         if (opts[5]) {                          /* -l */
1662                 fprintf(stderr, "Recognized codesets:\n");
1663                 for (s = (char *)__iconv_codesets ; *s ; s += *s) {
1664                         fprintf(stderr,"  %s\n", s+2);
1665                 }
1666                 s = __LOCALE_DATA_CODESET_LIST;
1667                 do {
1668                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1669                 } while (*++s);
1670
1671                 return EXIT_SUCCESS;
1672         }
1673
1674         if (opts[4]) {
1675                 hide_errors = 1;
1676         }
1677
1678         if (!opts[0] || !opts[1]) {
1679                 goto USAGE;
1680         }
1681         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1682                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1683         }
1684         if (opts[3]) {                          /* -c */
1685                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1686         }
1687
1688         if ((s = opts[2]) != NULL) {
1689                 if (!(ofile = fopen(s, "w"))) {
1690                         error_msg( "couldn't open %s for writing\n", s);
1691                 }
1692         }
1693
1694         pos = ni = 0;
1695         do {
1696                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1697                         ifile = stdin;          /* we don't check for duplicates */
1698                 } else if (!(ifile = fopen(*argv, "r"))) {
1699                         error_msg( "couldn't open %s for reading\n", *argv);
1700                 }
1701
1702                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1703                         pos += r;
1704                         ni += r;
1705                         no = OBUF;
1706                         pi = ibuf;
1707                         po = obuf;
1708                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1709                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1710                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1711                                 }
1712                         }
1713                         if ((r = OBUF - no) > 0) {
1714                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1715                                         error_msg( "write error\n");
1716                                 }
1717                         }
1718                         if (ni) {                       /* still bytes in buffer! */
1719                                 memmove(ibuf, pi, ni);
1720                         }
1721                 }
1722
1723                 if (ferror(ifile)) {
1724                         error_msg( "read error\n");
1725                 }
1726
1727                 ++argv;
1728
1729                 if (ifile != stdin) {
1730                         fclose(ifile);
1731                 }
1732
1733         } while (--argc > 0);
1734
1735         iconv_close(ic);
1736
1737         if (ni) {
1738                 error_msg( "incomplete sequence\n");
1739         }
1740
1741         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1742                 ? EXIT_SUCCESS : EXIT_FAILURE;
1743 }
1744
1745 #endif
1746 /**********************************************************************/