libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the standard's requirement that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
 100
 101 #define _GNU_SOURCE
 102 #define _ISOC99_SOURCE
 103 #include <errno.h>
 104 #include <stddef.h>
 105 #include <limits.h>
 106 #include <stdint.h>
 107 #include <inttypes.h>
 108 #include <stdlib.h>
 109 #include <stdio.h>
 110 #include <assert.h>
 111 #include <locale.h>
 112 #include <wchar.h>
 113 #include <bits/uClibc_uwchar.h>
 114
  115 /**********************************************************************/
      /* Locale-dependent configuration.
       *
       * With locale support compiled in, ENCODING reads the encoding field of
       * the current locale data, and the short Cc2wc_ / Cwc2c_ names alias the
       * corresponding __LOCALE_DATA_ constants used by the 8-bit table lookups
       * later in this file.
       *
       * Without locale support, ENCODING is the constant
       * __ctype_encoding_7_bit, a build that claims 8-bit or UTF-8 locale
       * support is rejected with #error, and the build selectors for the two
       * UTF-8 worker functions are undefined so those workers are not compiled.
       */
  116 #ifdef __UCLIBC_HAS_LOCALE__
  117 #ifdef __UCLIBC_MJN3_ONLY__
  118 #ifdef L_iswspace
  119 /* Guarded by L_iswspace so the warning is generated only once. */
  120 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
  121 #endif
  122 #endif /* __UCLIBC_MJN3_ONLY__ */
  123
      /* Encoding of the current locale. */
  124 #define ENCODING                ((__UCLIBC_CURLOCALE_DATA).encoding)
  125
      /* Short aliases for the locale table constants. */
  126 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
  127 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
  128 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
  129 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
  130 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
  131 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
  132
  133 #ifndef __CTYPE_HAS_UTF_8_LOCALES
  134 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
  135 #endif
  136
  137 #else  /* __UCLIBC_HAS_LOCALE__ */
  138
  139 #ifdef __UCLIBC_MJN3_ONLY__
  140 #ifdef L_btowc
  141 /* Guarded by L_btowc so the warning is emitted only once. */
  142 #warning fix preprocessor logic testing locale settings
  143 #endif
  144 #endif
  145
      /* No locale support: the encoding is always 7-bit. */
  146 #define ENCODING (__ctype_encoding_7_bit)
  147 #ifdef __CTYPE_HAS_8_BIT_LOCALES
  148 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
  149 #endif
  150 #ifdef __CTYPE_HAS_UTF_8_LOCALES
  151 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
  152 #endif
      /* The UTF-8 workers are never built in this configuration. */
  153 #undef L__wchar_utf8sntowcs
  154 #undef L__wchar_wcsntoutf8s
  155
  156 #endif /* __UCLIBC_HAS_LOCALE__ */
  157 /**********************************************************************/
  158
      /* UTF_8_MAX_LEN is selected from the range of wchar_t: 6 when wchar_t
       * can hold values above 0xffff, otherwise 3.  The UTF-8 workers test it
       * to choose their range checks. */
  159 #if WCHAR_MAX > 0xffffUL
  160 #define UTF_8_MAX_LEN 6
  161 #else
  162 #define UTF_8_MAX_LEN 3
  163 #endif
  164
      /* When KUHN is defined, both UTF-8 workers additionally reject the
       * values 0xfffe and 0xffff and the range 0xd800-0xdfff. */
  165 #define KUHN 1
  166
  167 /* Implementation-specific work functions. */
  168
      /* UTF-8 bytes -> wide characters: converts at most wn wide characters
       * from at most n bytes at *src, tracking partial characters in *ps. */
  169 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  170                                         const char **__restrict src, size_t n,
  171                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
  172
      /* Wide characters -> UTF-8 bytes: converts at most wn wide characters
       * from *src into at most n bytes at s. */
  173 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  174                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
 175
 176 /**********************************************************************/
 177 #ifdef L_btowc
 178
 179 libc_hidden_proto(mbrtowc)
 180
 181 wint_t btowc(int c)
 182 {
 183 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 184
 185         wchar_t wc;
 186         unsigned char buf[1];
 187         mbstate_t mbstate;
 188
 189         if (c != EOF) {
 190                 *buf = (unsigned char) c;
 191                 mbstate.__mask = 0;             /* Initialize the mbstate. */
 192                 if (mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
 193                         return wc;
 194                 }
 195         }
 196         return WEOF;
 197
 198 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 199
 200 #ifdef __UCLIBC_HAS_LOCALE__
 201         assert((ENCODING == __ctype_encoding_7_bit)
 202                    || (ENCODING == __ctype_encoding_utf8));
 203 #endif /* __UCLIBC_HAS_LOCALE__ */
 204
 205         /* If we don't have 8-bit locale support, then this is trivial since
 206          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 207         return (((unsigned int)c) < 0x80) ? c : WEOF;
 208
 209 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 210 }
 211 libc_hidden_proto(btowc)
 212 libc_hidden_def(btowc)
 213
 214 #endif
 215 /**********************************************************************/
 216 #ifdef L_wctob
 217
 218 /* Note: We completely ignore ps in all currently supported conversions. */
 219
 220 libc_hidden_proto(wcrtomb)
 221
 222 int wctob(wint_t c)
 223 {
 224 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 225
 226         unsigned char buf[MB_LEN_MAX];
 227
 228         return (wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
 229
 230 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 231
 232 #ifdef __UCLIBC_HAS_LOCALE__
 233         assert((ENCODING == __ctype_encoding_7_bit)
 234                    || (ENCODING == __ctype_encoding_utf8));
 235 #endif /* __UCLIBC_HAS_LOCALE__ */
 236
 237         /* If we don't have 8-bit locale support, then this is trivial since
 238          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 239
 240         /* TODO: need unsigned version of wint_t... */
 241 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 242         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 243
 244 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 245 }
 246
 247 #endif
 248 /**********************************************************************/
 249 #ifdef L_mbsinit
 250
 251 int mbsinit(const mbstate_t *ps)
 252 {
 253         return !ps || !ps->__mask;
 254 }
 255 libc_hidden_proto(mbsinit)
 256 libc_hidden_def(mbsinit)
 257
 258 #endif
 259 /**********************************************************************/
 260 #ifdef L_mbrlen
 261
 262 libc_hidden_proto(mbrtowc)
 263
 264 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 265 {
 266         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 267
 268         return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 269 }
 270 libc_hidden_proto(mbrlen)
 271 libc_hidden_def(mbrlen)
 272
 273 #endif
 274 /**********************************************************************/
 275 #ifdef L_mbrtowc
 276
 277 libc_hidden_proto(mbsnrtowcs)
 278
 279 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 280                            size_t n, mbstate_t *__restrict ps)
 281 {
 282         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 283         wchar_t wcbuf[1];
 284         const char *p;
 285         size_t r;
 286         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 287
 288         if (!ps) {
 289                 ps = &mbstate;
 290         }
 291
 292         if (!s) {
 293                 pwc = (wchar_t *) s;    /* NULL */
 294                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 295                 s = empty_string;
 296                 n = 1;
 297         } else if (!n) {
 298                 /* TODO: change error code? */
 299                 return (ps->__mask && (ps->__wc == 0xffffU))
 300                         ? ((size_t) -1) : ((size_t) -2);
 301         }
 302
 303         p = s;
 304
 305 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 306         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 307         if (ENCODING == __ctype_encoding_utf8) {
 308                 if (!pwc) {
 309                         pwc = wcbuf;
 310                 }
 311                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 312                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 313         }
 314 #endif
 315
 316 #ifdef __UCLIBC_MJN3_ONLY__
 317 #warning TODO: This adds a trailing nul!
 318 #endif /* __UCLIBC_MJN3_ONLY__ */
 319
 320         r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 321
 322         if (((ssize_t) r) >= 0) {
 323                 if (pwc) {
 324                         *pwc = *wcbuf;
 325                 }
 326         }
 327         return (size_t) r;
 328 }
 329 libc_hidden_proto(mbrtowc)
 330 libc_hidden_def(mbrtowc)
 331
 332 #endif
 333 /**********************************************************************/
 334 #ifdef L_wcrtomb
 335
 336 libc_hidden_proto(wcsnrtombs)
 337
 338 /* Note: We completely ignore ps in all currently supported conversions. */
 339 /* TODO: Check for valid state anyway? */
 340
 341 size_t wcrtomb(register char *__restrict s, wchar_t wc,
 342                            mbstate_t *__restrict ps)
 343 {
 344 #ifdef __UCLIBC_MJN3_ONLY__
 345 #warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
 346 #endif /* __UCLIBC_MJN3_ONLY__ */
 347         wchar_t wcbuf[1];
 348         const wchar_t *pwc;
 349         size_t r;
 350         char buf[MB_LEN_MAX];
 351
 352         if (!s) {
 353                 s = buf;
 354                 wc = 0;
 355         }
 356
 357         pwc = wcbuf;
 358         wcbuf[0] = wc;
 359
 360         r = wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
 361         return (r != 0) ? r : 1;
 362 }
 363 libc_hidden_proto(wcrtomb)
 364 libc_hidden_def(wcrtomb)
 365
 366 #endif
 367 /**********************************************************************/
 368 #ifdef L_mbsrtowcs
 369
 370 libc_hidden_proto(mbsnrtowcs)
 371
 372 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 373                                  size_t len, mbstate_t *__restrict ps)
 374 {
 375         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 376
 377         return mbsnrtowcs(dst, src, SIZE_MAX, len,
 378                                                 ((ps != NULL) ? ps : &mbstate));
 379 }
 380 libc_hidden_proto(mbsrtowcs)
 381 libc_hidden_def(mbsrtowcs)
 382
 383 #endif
 384 /**********************************************************************/
 385 #ifdef L_wcsrtombs
 386
 387 /* Note: We completely ignore ps in all currently supported conversions.
 388
 389  * TODO: Check for valid state anyway? */
 390
 391 libc_hidden_proto(wcsnrtombs)
 392
 393 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 394                                  size_t len, mbstate_t *__restrict ps)
 395 {
 396         return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 397 }
 398 libc_hidden_proto(wcsrtombs)
 399 libc_hidden_def(wcsrtombs)
 400
 401 #endif
 402 /**********************************************************************/
 403 #ifdef L__wchar_utf8sntowcs
 404
 405 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  406  * UTF-8-test.txt stress test.
 407  */
 408 /*  #define DECODER */
 409
 410 #ifdef DECODER
 411 #ifndef KUHN
 412 #define KUHN
 413 #endif
 414 #endif
  415
      /* Convert a UTF-8 byte sequence to wide characters.
       *
       * pwc   - destination buffer.  NULL means "count only" (wn is then
       *         treated as SIZE_MAX).  Passing (wchar_t *) ps also suppresses
       *         storing but keeps the caller's wn limit (see the hack note in
       *         the body).
       * wn    - maximum number of wide characters to produce; 0 returns 0
       *         at once.
       * src   - in/out pointer to the input bytes.  On normal completion it
       *         is written back only when output is really being stored; it
       *         becomes NULL when a terminating nul was converted (non-DECODER
       *         builds).  It is also written back on the (size_t) -2 path.
       * n     - maximum number of input bytes to examine.
       * ps    - conversion state (must not be NULL).  A nonzero __mask marks
       *         a saved partial character held in __wc; in non-DECODER builds
       *         __wc == 0xffffU marks the error state.
       * allow_continuation - nonzero: an incomplete trailing sequence is
       *         saved in *ps and (size_t) -2 is returned.  Zero: nothing is
       *         saved; 0 is returned if characters were already converted,
       *         otherwise s is backed up over the partial sequence and the
       *         function finishes normally.
       *
       * Returns the number of wide characters converted (in non-DECODER
       * builds a terminating nul is not counted), (size_t) -2 for a saved
       * incomplete sequence, or, in non-DECODER builds, (size_t) -1 with
       * errno set to EILSEQ for an invalid sequence or an error state.
       */
  416 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  417                                                   const char **__restrict src, size_t n,
  418                                                   mbstate_t *ps, int allow_continuation)
  419 {
  420         register const char *s;
  421         __uwchar_t mask;
  422         __uwchar_t wc;
  423         wchar_t wcbuf[1];
  424         size_t count;
  425         int incr;
  426
  427         s = *src;
  428
  429         assert(s != NULL);
  430         assert(ps != NULL);
  431
  432         incr = 1;
  433         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  434          * wprintf, we need to be able to compute the number of wchars needed
  435          * for the mbs conversion, not to exceed the precision specified.
  436          * But if dst is NULL, the return value is the length assuming a
  437          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
  438          * as pwc in order to flag that we really want the length, subject
  439          * to the restricted buffer size and no partial conversions.
  440          * See mbsnrtowcs() as well. */
  441         if (!pwc || (pwc == ((wchar_t *)ps))) {
  442                 if (!pwc) {
  443                         wn = SIZE_MAX;
  444                 }
  445                 pwc = wcbuf;            /* Every store lands in this scratch slot. */
  446                 incr = 0;
  447         }
  448
  449         /* This is really here only to support the glibc extension function
  450          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  451          * check on the validity of the mbstate. */
  452         if (!(count = wn)) {
  453                 return 0;
  454         }
  455
  456         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
  457 #ifdef DECODER
  458                 wc = (__uwchar_t) ps->__wc;
  459                 if (n) {
  460                         goto CONTINUE;
  461                 }
  462                 goto DONE;
  463 #else
  464                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
  465                         /* TODO: change error code here and below? */
  466                         if (n) {
  467                                 goto CONTINUE;
  468                         }
  469                         goto DONE;
  470                 }
  471                 __set_errno(EILSEQ);
  472                 return (size_t) -1;             /* We're in an error state. */
  473 #endif
  474         }
  475
  476         do {
  477                 if (!n) {
  478                         goto DONE;
  479                 }
  480                 --n;
  481                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  482                         mask = 0x40;
  483 #ifdef __UCLIBC_MJN3_ONLY__
  484 #warning TODO: Fix range for 16 bit wchar_t case.
  485 #endif
                              /* Only 0xc0..0xfd are accepted as a start byte. */
  486                         if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
  487                                 goto START;
  488                         }
  489                 BAD:
  490 #ifdef DECODER
  491                         wc = 0xfffdU;
  492                         goto COMPLETE;
  493 #else
                              /* Record the error state in *ps before failing. */
  494                         ps->__mask = mask;
  495                         ps->__wc = 0xffffU;
  496                         __set_errno(EILSEQ);
  497                         return (size_t) -1;     /* Illegal start byte! */
  498 #endif
  499
  500                 CONTINUE:
  501                         while (n) {
  502                                 --n;
                                      /* The next byte must have the form 10xxxxxx. */
  503                                 if ((*s & 0xc0) != 0x80) {
  504                                         goto BAD;
  505                                 }
  506                                 mask <<= 5;
  507                                 wc <<= 6;
  508                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
  509                                 ++s;
  510                         START:
  511                                 wc &= ~(mask << 1);
  512
  513                                 if ((wc & mask) == 0) { /* Character completed. */
  514                                         if ((mask >>= 5) == 0x40) {
  515                                                 mask += mask;
  516                                         }
  517                                         /* Check for invalid sequences (longer than necessary)
  518                                          * and invalid chars.  */
  519                                         if ( (wc < mask) /* Sequence not minimal length. */
  520 #ifdef KUHN
  521 #if UTF_8_MAX_LEN == 3
  522 #error broken since mask can overflow!!
  523                                                  /* For plane 0, these are the only defined values.*/
  524                                                  || (wc > 0xfffdU)
  525 #else
  526                                                  /* Note that we don't need to worry about exceeding */
  527                                                  /* 31 bits as that is the most that UTF-8 provides. */
  528                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  529 #endif
  530                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  531 #endif /* KUHN */
  532                                                  ) {
  533                                                 goto BAD;
  534                                         }
  535                                         goto COMPLETE;
  536                                 }
  537                         }
  538                         /* Character potentially valid but incomplete. */
  539                         if (!allow_continuation) {
  540                                 if (count != wn) {
  541                                         return 0;
  542                                 }
  543                                 /* NOTE: The following can fail if you allow and then disallow
  544                                  * continuation!!! */
  545 #if UTF_8_MAX_LEN == 3
  546 #error broken since mask can overflow!!
  547 #endif
  548                                 /* Need to back up... */
  549                                 do {
  550                                         --s;
  551                                 } while ((mask >>= 5) >= 0x40);
  552                                 goto DONE;
  553                         }
                              /* Save the partial character so a later call can resume. */
  554                         ps->__mask = (wchar_t) mask;
  555                         ps->__wc = (wchar_t) wc;
  556                         *src = s;
  557                         return (size_t) -2;
  558                 }
  559         COMPLETE:
  560                 *pwc = wc;
  561                 pwc += incr;
  562         }
  563 #ifdef DECODER
  564         while (--count);
  565 #else
              /* Stop after a nul (which is not counted) or after wn characters. */
  566         while (wc && --count);
  567
  568         if (!wc) {
  569                 s = NULL;
  570         }
  571 #endif
  572
  573  DONE:
  574         /* ps->__wc is irrelevant here. */
  575         ps->__mask = 0;
              /* Report the input position only when output was really stored. */
  576         if (pwc != wcbuf) {
  577                 *src = s;
  578         }
  579
  580         return wn - count;
  581 }
 582
 583 #endif
 584 /**********************************************************************/
 585 #ifdef L__wchar_wcsntoutf8s
 586
 587 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 588                                                   const wchar_t **__restrict src, size_t wn)
 589 {
 590         register char *p;
 591         size_t len, t;
 592         __uwchar_t wc;
 593         const __uwchar_t *swc;
 594         int store;
 595         char buf[MB_LEN_MAX];
 596         char m;
 597
 598         store = 1;
 599         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 600          * printf, we need to be able to compute the number of bytes needed
 601          * for the mbs conversion, not to exceed the precision specified.
 602          * But if dst is NULL, the return value is the length assuming a
 603          * sufficiently sized buffer.  So, we allow passing of (char *) src
 604          * as dst in order to flag that we really want the length, subject
 605          * to the restricted buffer size and no partial conversions.
 606          * See wcsnrtombs() as well. */
 607         if (!s || (s == ((char *) src))) {
 608                 if (!s) {
 609                         n = SIZE_MAX;
 610                 }
 611             s = buf;
 612                 store = 0;
 613         }
 614
 615         t = n;
 616         swc = (const __uwchar_t *) *src;
 617
 618         assert(swc != NULL);
 619
 620         while (wn && t) {
 621                 wc = *swc;
 622
 623                 *s = wc;
 624                 len = 1;
 625
 626                 if (wc >= 0x80) {
 627 #ifdef KUHN
 628                         if (
 629 #if UTF_8_MAX_LEN == 3
 630                                 /* For plane 0, these are the only defined values.*/
 631                                 /* Note that we don't need to worry about exceeding */
 632                                 /* 31 bits as that is the most that UTF-8 provides. */
 633                                 (wc > 0xfffdU)
 634 #else
 635                                 /* UTF_8_MAX_LEN == 6 */
 636                                 (wc > 0x7fffffffUL)
 637                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 638 #endif
 639                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 640                                 ) {
 641                                 __set_errno(EILSEQ);
 642                                 return (size_t) -1;
 643                         }
 644 #else  /* KUHN */
 645 #if UTF_8_MAX_LEN != 3
 646                         if (wc > 0x7fffffffUL) { /* Value too large. */
 647                                 __set_errno(EILSEQ);
 648                                 return (size_t) -1;
 649                         }
 650 #endif
 651 #endif /* KUHN */
 652
 653                         wc >>= 1;
 654                         p = s;
 655                         do {
 656                                 ++p;
 657                         } while (wc >>= 5);
 658                         wc = *swc;
 659                         if ((len = p - s) > t) { /* Not enough space. */
 660                                 break;
 661                         }
 662
 663                         m = 0x80;
 664                         while( p>s ) {
 665                                 m = (m >> 1) | 0x80;
 666                                 *--p = (wc & 0x3f) | 0x80;
 667                                 wc >>= 6;
 668                         }
 669                         *s |= (m << 1);
 670                 } else if (wc == 0) {   /* End of string. */
 671                         swc = NULL;
 672                         break;
 673                 }
 674
 675                 ++swc;
 676                 --wn;
 677                 t -= len;
 678                 if (store) {
 679                         s += len;
 680                 }
 681         }
 682
 683         if (store) {
 684                 *src = (const wchar_t *) swc;
 685         }
 686
 687         return n - t;
 688 }
 689
 690
 691 #endif
 692 /**********************************************************************/
 693 #ifdef L_mbsnrtowcs
 694
 695 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 696
 697 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 698                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
 699 {
 700         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 701         wchar_t wcbuf[1];
 702         const char *s;
 703         size_t count;
 704         int incr;
 705
 706         if (!ps) {
 707                 ps = &mbstate;
 708         }
 709
 710 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 711         if (ENCODING == __ctype_encoding_utf8) {
 712                 size_t r;
 713                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
 714                                 != (size_t) -2) ? r : 0;
 715         }
 716 #endif
 717         incr = 1;
 718         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 719          * wprintf, we need to be able to compute the number of wchars needed
 720          * for the mbs conversion, not to exceed the precision specified.
 721          * But if dst is NULL, the return value is the length assuming a
 722          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
 723          * as dst in order to flag that we really want the length, subject
 724          * to the restricted buffer size and no partial conversions.
 725          * See _wchar_utf8sntowcs() as well. */
 726         if (!dst || (dst == ((wchar_t *)ps))) {
 727                 if (!dst) {
 728                         len = SIZE_MAX;
 729                 }
 730                 dst = wcbuf;
 731                 incr = 0;
 732         }
 733
 734         /* Since all the following encodings are single-byte encodings... */
 735         if (len > NMC) {
 736                 len = NMC;
 737         }
 738
 739         count = len;
 740         s = *src;
 741
 742 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 743         if (ENCODING == __ctype_encoding_8_bit) {
 744                 wchar_t wc;
 745                 while (count) {
 746                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
 747                                 wc -= 0x80;
 748                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
 749                                                   (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
 750                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
 751                                 if (!wc) {
 752                                         goto BAD;
 753                                 }
 754                         }
 755                         if (!(*dst = wc)) {
 756                                 s = NULL;
 757                                 break;
 758                         }
 759                         dst += incr;
 760                         ++s;
 761                         --count;
 762                 }
 763                 if (dst != wcbuf) {
 764                         *src = s;
 765                 }
 766                 return len - count;
 767         }
 768 #endif
 769
 770 #ifdef __UCLIBC_HAS_LOCALE__
 771         assert(ENCODING == __ctype_encoding_7_bit);
 772 #endif
 773
 774         while (count) {
 775                 if ((*dst = (unsigned char) *s) == 0) {
 776                         s = NULL;
 777                         break;
 778                 }
 779                 if (*dst >= 0x80) {
 780 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 781                 BAD:
 782 #endif
 783                         __set_errno(EILSEQ);
 784                         return (size_t) -1;
 785                 }
 786                 ++s;
 787                 dst += incr;
 788                 --count;
 789         }
 790         if (dst != wcbuf) {
 791                 *src = s;
 792         }
 793         return len - count;
 794 }
 795 libc_hidden_proto(mbsnrtowcs)
 796 libc_hidden_def(mbsnrtowcs)
 797
 798 #endif
 799 /**********************************************************************/
 800 #ifdef L_wcsnrtombs
 801
 802 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 803
 804 /* Note: We completely ignore ps in all currently supported conversions.
 805  * TODO: Check for valid state anyway? */
 806
 807 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 808                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 809 {
             /* Convert at most NWC wide characters starting at *src into at most
              * len bytes, using the encoding selected by ENCODING.
              *
              * - UTF-8: the whole job is handed to _wchar_wcsntoutf8s().
              * - 8-bit and 7-bit encodings: one output byte per wide character.
              *
              * dst == NULL            : len is treated as SIZE_MAX, bytes go to a
              *                          scratch buffer, *src is left untouched.
              * dst == (char *) src    : length-only mode as well, but the caller's
              *                          len is honoured; *src is left untouched.
              *
              * Returns the number of bytes produced, not counting a terminating
              * nul.  When a nul wide character is converted (and dst is a real
              * buffer) the nul is stored and *src is set to NULL; otherwise *src
              * is advanced past the converted characters.  The 7-bit path
              * returns (size_t) -1 with errno set to EILSEQ for a character
              * >= 0x80.  ps is not referenced anywhere in this body. */
 810         const __uwchar_t *s;
 811         size_t count;
 812         int incr;
 813         char buf[MB_LEN_MAX];
 814
 815 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 816         if (ENCODING == __ctype_encoding_utf8) {
 817                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 818         }
 819 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 820
             /* incr is the step applied to dst after each character: 1 when
              * really storing, 0 when only counting (every byte then lands on
              * buf[0]). */
 821         incr = 1;
 822         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 823          * printf, we need to be able to compute the number of bytes needed
 824          * for the mbs conversion, not to exceed the precision specified.
 825          * But if dst is NULL, the return value is the length assuming a
 826          * sufficiently sized buffer.  So, we allow passing of (char *) src
 827          * as dst in order to flag that we really want the length, subject
 828          * to the restricted buffer size and no partial conversions.
 829          * See _wchar_wcsntoutf8s() as well. */
 830         if (!dst || (dst == ((char *) src))) {
 831                 if (!dst) {
 832                         len = SIZE_MAX;
 833                 }
 834                 dst = buf;
 835                 incr = 0;
 836         }
 837
 838         /* Since all the following encodings are single-byte encodings... */
 839         if (len > NWC) {
 840                 len = NWC;
 841         }
 842
             /* count is the remaining budget; len - count is what was produced. */
 843         count = len;
 844         s = (const __uwchar_t *) *src;
 845
 846 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 847         if (ENCODING == __ctype_encoding_8_bit) {
 848                 __uwchar_t wc;
 849                 __uwchar_t u;
 850                 while (count) {
 851                         if ((wc = *s) <= 0x7f) {
                                     /* ASCII is copied as is; a nul ends the conversion
                                      * and is not counted (we break before --count). */
 852                                 if (!(*dst = (unsigned char) wc)) {
 853                                         s = NULL;
 854                                         break;
 855                                 }
 856                         } else {
                                     /* Look wc up through the idx8wc2c / tbl8wc2c tables of
                                      * the current locale data.  u stays 0 when wc is above
                                      * Cwc2c_DOMAIN_MAX; a 0 from the tables presumably
                                      * means "no mapping" (it is replaced by '?' below). */
 857                                 u = 0;
 858                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 859                                         u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 860                                                                                                                 + Cwc2c_TT_SHIFT)];
 861                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 862                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 863                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 864                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
 865                                                                         + (u << Cwc2c_TT_SHIFT)
 866                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 867                                 }
 868
     /* The macro is defined right here, so as written the #else branch below
      * and the BAD label further down are compiled out. */
 869 #define __WCHAR_REPLACEMENT_CHAR '?'
 870 #ifdef __WCHAR_REPLACEMENT_CHAR
 871                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 872 #else  /* __WCHAR_REPLACEMENT_CHAR */
 873                                 if (!u) {
 874                                         goto BAD;
 875                                 }
 876                                 *dst = (unsigned char) u;
 877 #endif /* __WCHAR_REPLACEMENT_CHAR */
 878                         }
 879                         ++s;
 880                         dst += incr;
 881                         --count;
 882                 }
                     /* dst still equals buf only in the length-only modes (incr == 0);
                      * in that case the caller's *src must not be modified. */
 883                 if (dst != buf) {
 884                         *src = (const wchar_t *) s;
 885                 }
 886                 return len - count;
 887         }
 888 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 889
 890 #ifdef __UCLIBC_HAS_LOCALE__
 891         assert(ENCODING == __ctype_encoding_7_bit);
 892 #endif
 893
             /* 7-bit path: anything >= 0x80 is an encoding error. */
 894         while (count) {
 895                 if (*s >= 0x80) {
 896 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
 897                 BAD:
 898 #endif
 899                         __set_errno(EILSEQ);
 900                         return (size_t) -1;
 901                 }
                     /* Store the byte; a nul ends the conversion uncounted. */
 902                 if ((*dst = (unsigned char) *s) == 0) {
 903                         s = NULL;
 904                         break;
 905                 }
 906                 ++s;
 907                 dst += incr;
 908                 --count;
 909         }
             /* Same rule as above: only a real destination updates *src. */
 910         if (dst != buf) {
 911                 *src = (const wchar_t *) s;
 912         }
 913         return len - count;
 914 }
 915 libc_hidden_proto(wcsnrtombs)
 916 libc_hidden_def(wcsnrtombs)
 917
 918 #endif
 919 /**********************************************************************/
 920 #ifdef L_wcswidth
 921
 922 #ifdef __UCLIBC_MJN3_ONLY__
 923 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
 924 #warning TODO: Update wcwidth to match latest by Kuhn.
 925 #endif
 926
 927 #if defined(__UCLIBC_HAS_LOCALE__) && \
 928 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 929
 930 static const unsigned char new_idx[] = {
             /* Page index used by wcswidth() for wide characters 0x100..0xffff.
              * It is indexed by the high byte (wc >> 8); new_idx[h] and
              * new_idx[h+1] are the initial bounds of the binary search that
              * wcswidth() runs over new_tbl, and the index it ends on selects
              * the width in new_wtbl.  There are 257 entries so that h+1 is
              * valid for h == 0xff. */
 931         0,    5,    5,    6,   10,   15,   28,   39,
 932         48,   48,   71,   94,  113,  128,  139,  154,
 933         175,  186,  188,  188,  188,  188,  188,  188,
 934         203,  208,  208,  208,  208,  208,  208,  208,
 935         208,  219,  219,  219,  222,  222,  222,  222,
 936         222,  222,  222,  222,  222,  222,  222,  224,
 937         224,  231,  231,  231,  231,  231,  231,  231,
 938         231,  231,  231,  231,  231,  231,  231,  231,
 939         231,  231,  231,  231,  231,  231,  231,  231,
 940         231,  231,  231,  231,  231,  231,  231,  231,
 941         231,  231,  231,  231,  231,  231,  231,  231,
 942         231,  231,  231,  231,  231,  231,  231,  231,
 943         231,  231,  231,  231,  231,  231,  231,  231,
 944         231,  231,  231,  231,  231,  231,  231,  231,
 945         231,  231,  231,  231,  231,  231,  231,  231,
 946         231,  231,  231,  231,  231,  231,  231,  231,
 947         231,  231,  231,  231,  231,  231,  231,  231,
 948         231,  231,  231,  231,  231,  231,  231,  231,
 949         231,  231,  231,  231,  231,  231,  231,  231,
 950         231,  231,  231,  231,  231,  231,  231,  231,
 951         231,  231,  231,  231,  231,  233,  233,  233,
 952         233,  233,  233,  233,  234,  234,  234,  234,
 953         234,  234,  234,  234,  234,  234,  234,  234,
 954         234,  234,  234,  234,  234,  234,  234,  234,
 955         234,  234,  234,  234,  234,  234,  234,  234,
 956         234,  234,  234,  234,  234,  234,  234,  234,
 957         234,  234,  234,  234,  234,  234,  234,  234,
 958         236,  236,  236,  236,  236,  236,  236,  236,
 959         236,  236,  236,  236,  236,  236,  236,  236,
 960         236,  236,  236,  236,  236,  236,  236,  236,
 961         236,  236,  236,  236,  236,  236,  236,  236,
 962         236,  237,  237,  238,  241,  241,  242,  249,
 963         255,
 964 };
 965
 966 static const unsigned char new_tbl[] = {
             /* Low-byte boundaries searched by wcswidth().  For a wide
              * character in 0x100..0xffff, wcswidth() binary-searches the
              * slice selected through new_idx, comparing the low byte
              * (wc & 0xff) against these values with ">=", and then reads the
              * width from new_wtbl at the index the search ends on.  The
              * values therefore look like the starting low bytes of runs that
              * share a width -- NOTE(review): confirm against the table
              * generator.  255 entries, parallel to new_wtbl. */
 967         0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
 968         0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
 969         0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
 970         0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
 971         0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
 972         0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
 973         0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
 974         0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
 975         0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
 976         0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
 977         0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
 978         0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
 979         0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
 980         0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
 981         0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
 982         0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
 983         0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
 984         0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
 985         0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
 986         0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
 987         0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
 988         0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
 989         0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
 990         0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
 991         0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
 992         0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
 993         0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
 994         0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
 995         0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
 996         0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
 997         0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
 998         0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
 999 };
1000
1001 static const signed char new_wtbl[] = {
             /* Column widths, parallel to new_tbl (255 entries).  wcswidth()
              * adds new_wtbl[l] to its running count, where l is the index its
              * binary search over new_tbl ends on.  The values present are
              * -1, 0, 1 and 2; the -1 entries sit in the first slice (page 0),
              * which wcswidth() never looks up here because characters
              * <= 0xff are handled before the table search. */
1002         0,   -1,    1,   -1,    1,    1,    0,    1,
1003         0,    1,    1,    0,    1,    0,    1,    1,
1004         0,    1,    0,    1,    0,    1,    0,    1,
1005         0,    1,    0,    1,    1,    0,    1,    0,
1006         1,    0,    1,    0,    1,    0,    1,    1,
1007         0,    1,    0,    1,    0,    1,    0,    1,
1008         1,    0,    1,    0,    1,    0,    1,    0,
1009         1,    0,    1,    0,    1,    0,    1,    0,
1010         1,    0,    1,    0,    1,    0,    1,    1,
1011         0,    1,    0,    1,    0,    1,    0,    1,
1012         0,    1,    0,    1,    0,    1,    0,    1,
1013         0,    1,    0,    1,    0,    1,    1,    0,
1014         1,    0,    1,    0,    1,    0,    1,    0,
1015         1,    0,    1,    0,    1,    0,    1,    0,
1016         1,    1,    0,    1,    0,    1,    0,    1,
1017         0,    1,    0,    1,    0,    1,    0,    1,
1018         1,    0,    1,    0,    1,    0,    1,    0,
1019         1,    0,    1,    1,    0,    1,    0,    1,
1020         0,    1,    0,    1,    0,    1,    0,    1,
1021         0,    1,    1,    0,    1,    0,    1,    0,
1022         1,    0,    1,    0,    1,    0,    1,    0,
1023         1,    0,    1,    0,    1,    0,    1,    1,
1024         0,    1,    0,    1,    0,    1,    0,    1,
1025         0,    1,    2,    0,    1,    0,    1,    0,
1026         1,    0,    1,    0,    1,    0,    1,    0,
1027         1,    0,    1,    1,    0,    1,    0,    1,
1028         1,    0,    1,    0,    1,    0,    1,    0,
1029         1,    0,    1,    1,    2,    1,    1,    2,
1030         2,    0,    2,    1,    2,    0,    2,    2,
1031         1,    1,    2,    1,    1,    2,    1,    0,
1032         1,    1,    0,    1,    0,    1,    2,    1,
1033         0,    2,    1,    2,    1,    0,    1,
1034 };
1035
1036 libc_hidden_proto(wcsnrtombs)
1037
1038 int wcswidth(const wchar_t *pwcs, size_t n)
1039 {
1040     int h, l, m, count;
1041     wchar_t wc;
1042     unsigned char b;
1043
1044         if (ENCODING == __ctype_encoding_7_bit) {
1045                 size_t i;
1046
1047                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1048                         if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
1049                                 return -1;
1050                         }
1051                 }
1052         }
1053 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1054         else if (ENCODING == __ctype_encoding_8_bit) {
1055                 mbstate_t mbstate;
1056
1057                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1058                 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1059                         return -1;
1060                 }
1061         }
1062 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1063 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1064         /* For stricter handling of allowed unicode values... see comments above. */
1065         else if (ENCODING == __ctype_encoding_utf8) {
1066                 size_t i;
1067
1068                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1069                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1070                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1071                                 ) {
1072                                 return -1;
1073                         }
1074                 }
1075         }
1076 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1077
1078     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1079                 if (wc <= 0xff) {
1080                         /* If we're here, wc != 0. */
1081                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1082                                 return -1;
1083                         }
1084                         ++count;
1085                         continue;
1086                 }
1087                 if (((unsigned int) wc) <= 0xffff) {
1088                         b = wc & 0xff;
1089                         h = (wc >> 8);
1090                         l = new_idx[h];
1091                         h = new_idx[h+1];
1092                         while ((m = (l+h) >> 1) != l) {
1093                                 if (b >= new_tbl[m]) {
1094                                         l = m;
1095                                 } else {                /* wc < tbl[m] */
1096                                         h = m;
1097                                 }
1098                         }
1099                         count += new_wtbl[l]; /* none should be -1. */
1100                         continue;
1101                 }
1102
1103                 /* Redo this to minimize average number of compares?*/
1104                 if (wc >= 0x1d167) {
1105                         if (wc <= 0x1d1ad) {
1106                                 if ((wc <= 0x1d169
1107                                          || (wc >= 0x1d173
1108                                                  && (wc <= 0x1d182
1109                                                          || (wc >= 0x1d185
1110                                                                  && (wc <= 0x1d18b
1111                                                                          || (wc >= 0x1d1aa))))))
1112                                         ) {
1113                                         continue;
1114                                 }
1115                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1116                                 continue;
1117                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1118                                 ++count;                /* need 2.. add one here */
1119                         }
1120 #if (WCHAR_MAX > 0x7fffffffL)
1121                         else if (wc > 0x7fffffffL) {
1122                                 return -1;
1123                         }
1124 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1125                 }
1126
1127                 ++count;
1128     }
1129
1130     return count;
1131 }
1132
1133 #else  /*  __UCLIBC_HAS_LOCALE__ */
1134
/*
 * wcswidth - number of display columns occupied by a wide-character string
 * (build without locale support).
 *
 * pwcs  wide characters to measure
 * n     maximum number of wide characters to examine; scanning also stops
 *       at the first nul
 *
 * Returns one column per printable ASCII character, or -1 as soon as any
 * other character is seen.
 *
 * Only 0x20..0x7e are accepted.  This used to accept 0xa0..0xff as well,
 * although the 7-bit conversion path of wcsnrtombs() in this file rejects
 * every character >= 0x80, so such characters can be neither converted nor
 * printed in this configuration.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
        int count;
        wchar_t wc;

        for (count = 0 ; n && (wc = *pwcs++) ; n--) {
                /* If we're here, wc != 0. */
                if ((wc < 32) || (wc >= 0x7f)) {
                        return -1;
                }
                ++count;
        }

        return count;
}
1155
1156 #endif /*  __UCLIBC_HAS_LOCALE__ */
1157
1158 libc_hidden_proto(wcswidth)
1159 libc_hidden_def(wcswidth)
1160
1161 #endif
1162 /**********************************************************************/
1163 #ifdef L_wcwidth
1164
1165 libc_hidden_proto(wcswidth)
1166
1167 int wcwidth(wchar_t wc)
1168 {
         /* Column width of a single wide character.  Delegates to wcswidth()
          * on a one-element string, so the result (including -1 for rejected
          * characters) is whatever wcswidth() reports; wcswidth() stops at a
          * nul, so wc == 0 yields 0. */
1169     return wcswidth(&wc, 1);
1170 }
1171
1172 #endif
1173 /**********************************************************************/
1174
1175
1176 typedef struct {
             /* Conversion descriptor allocated by iconv_open() and returned to
              * the caller cast to iconv_t.  The fields ending in 0 keep the
              * values chosen at open time; iconv() copies them back into the
              * working fields when asked to reset the conversion state. */
1177         mbstate_t tostate;              /* state paired with tocodeset; its __mask is zeroed at open/reset */
1178         mbstate_t fromstate;            /* state paired with fromcodeset; handed to the UTF-8 decoder by iconv() */
1179         int tocodeset;                  /* id returned by find_codeset() for the target name */
1180         int fromcodeset;                /* working source id; iconv() may toggle its bit 0 after reading a BOM */
1181         int frombom;                    /* nonzero until iconv() has looked for a BOM on the input */
1182         int tobom;                      /* nonzero until iconv() has produced the output BOM */
1183         int fromcodeset0;               /* fromcodeset as set by iconv_open() */
1184         int frombom0;                   /* frombom as set by iconv_open() */
1185         int tobom0;                     /* tobom as set by iconv_open() */
1186         int skip_invalid_input;         /* To support iconv -c option. */
1187 } _UC_iconv_t;
1188
1189
1190
1191 #ifdef L_iconv
1192
1193 #include <iconv.h>
1194 #include <string.h>
1195 #include <endian.h>
1196 #include <byteswap.h>
1197
1198 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1199 #error unsupported endianness for iconv
1200 #endif
1201
1202 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1203 #error currently iconv requires 8 bit locales
1204 #endif
1205 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1206 #error currently iconv requires UTF-8 locales
1207 #endif
1208
1209
1210 enum {
             /* Codeset ids as stored in __iconv_codesets and returned by
              * find_codeset().  iconv() treats every id >= IC_MULTIBYTE as a
              * fixed-width 2- or 4-byte encoding; IC_WCHAR_T shares the same
              * value and is special-cased there as 4 bytes wide. */
1211         IC_WCHAR_T = 0xe0,
1212         IC_MULTIBYTE = 0xe0,
             /* The two branches differ only in bit 0 of each value. */
1213 #if __BYTE_ORDER == __BIG_ENDIAN
1214         IC_UCS_4 =      0xec,
1215         IC_UTF_32 = 0xe4,
1216         IC_UCS_2 =      0xe2,
1217         IC_UTF_16 = 0xea,
1218 #else
1219         IC_UCS_4 =      0xed,
1220         IC_UTF_32 = 0xe5,
1221         IC_UCS_2 =      0xe3,
1222         IC_UTF_16 = 0xeb,
1223 #endif
             /* Small ids: find_codeset() numbers the 8-bit codesets from 3 up. */
1224         IC_UTF_8 = 2,
1225         IC_ASCII = 1
1226 };
1227
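1228 /* For the multibyte (fixed-width) codeset ids, i.e. those >= IC_MULTIBYTE:
1229  * bit 0 means swap endian
1230  * bit 1 means 2 byte
1231  * bit 2 means 4 byte
1232  * bit 4 (0x10) means BOM handling is wanted (tested by iconv_open)
1233  */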
1234
/*
 * Table of the built-in codeset names understood by find_codeset().
 *
 * Each entry is:  <length> <codeset id> <name> <nul>
 * where <length> is the size of the whole entry in bytes; find_codeset()
 * uses it as the stride to the next entry and stops at a zero length byte.
 * The last entry omits its own nul and relies on the terminator of the
 * string literal, which then doubles as the end-of-table marker.
 *
 * The big-endian "UTF-32" entry used to read "\x09\fe4": a form feed
 * followed by the text "e4".  That gave the entry a bogus id and made it
 * two bytes longer than its length byte claims, which derails the walk for
 * it and every entry behind it.  It is now "\xe4", i.e. IC_UTF_32 for
 * big-endian hosts and in line with the "UTF-16" entry of the same table.
 *
 * NOTE(review): the little-endian table sets the 0x10 (BOM) bit on its
 * "UTF-32"/"UTF-16" entries while the big-endian table does not -- confirm
 * which of the two is intended.
 */
const unsigned char __iconv_codesets[] =
        "\x0a\xe0""WCHAR_T\x00"         /* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
        "\x08\xec""UCS-4\x00"           /* always BE */
        "\x0a\xec""UCS-4BE\x00"
        "\x0a\xed""UCS-4LE\x00"
        "\x09\xe4""UTF-32\x00"          /* platform endian with BOM */
        "\x0b\xe4""UTF-32BE\x00"
        "\x0b\xe5""UTF-32LE\x00"
        "\x08\xe2""UCS-2\x00"           /* always BE */
        "\x0a\xe2""UCS-2BE\x00"
        "\x0a\xe3""UCS-2LE\x00"
        "\x09\xea""UTF-16\x00"          /* platform endian with BOM */
        "\x0b\xea""UTF-16BE\x00"
        "\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
        "\x08\xed""UCS-4\x00"           /* always BE */
        "\x0a\xed""UCS-4BE\x00"
        "\x0a\xec""UCS-4LE\x00"
        "\x09\xf4""UTF-32\x00"          /* platform endian with BOM */
        "\x0b\xe5""UTF-32BE\x00"
        "\x0b\xe4""UTF-32LE\x00"
        "\x08\xe3""UCS-2\x00"           /* always BE */
        "\x0a\xe3""UCS-2BE\x00"
        "\x0a\xe2""UCS-2LE\x00"
        "\x09\xfa""UTF-16\x00"          /* platform endian with BOM */
        "\x0b\xeb""UTF-16BE\x00"
        "\x0b\xea""UTF-16LE\x00"
#endif
        "\x08\x02""UTF-8\x00"
        "\x0b\x01""US-ASCII\x00"
        "\x07\x01""ASCII";                      /* Must be last! (special case to save a nul) */
1267
1268 libc_hidden_proto(strcasecmp)
1269
1270 static int find_codeset(const char *name)
1271 {
1272         const unsigned char *s;
1273         int codeset;
1274
1275         for (s = __iconv_codesets ; *s ; s += *s) {
1276                 if (!strcasecmp(s+2, name)) {
1277                         return s[1];
1278                 }
1279         }
1280
1281         /* The following is ripped from find_locale in locale.c. */
1282
1283         /* TODO: maybe CODESET_LIST + *s ??? */
1284         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1285         codeset = 2;
1286         s = __LOCALE_DATA_CODESET_LIST;
1287         do {
1288                 ++codeset;              /* Increment codeset first. */
1289                 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1290                         return codeset;
1291                 }
1292         } while (*++s);
1293
1294         return 0;                       /* No matching codeset! */
1295 }
1296
1297 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1298 {
1299         register _UC_iconv_t *px;
1300         int tocodeset, fromcodeset;
1301
1302         if (((tocodeset = find_codeset(tocode)) != 0)
1303                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1304                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1305                         px->tocodeset = tocodeset;
1306                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1307                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1308                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1309                         px->skip_invalid_input = px->tostate.__mask
1310                                 = px->fromstate.__mask = 0;
1311                         return (iconv_t) px;
1312                 }
1313         } else {
1314                 __set_errno(EINVAL);
1315         }
1316         return (iconv_t)(-1);
1317 }
1318
1319 int weak_function iconv_close(iconv_t cd)
1320 {
             /* Release a descriptor obtained from iconv_open(); always returns
              * 0.  cd is handed straight to free() without any validation, so
              * it must not be the (iconv_t)(-1) failure value of iconv_open(). */
1321         free(cd);
1322
1323         return 0;
1324 }
1325
1326 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1327                                                    size_t *__restrict inbytesleft,
1328                                                    char **__restrict outbuf,
1329                                                    size_t *__restrict outbytesleft)
1330 {
1331         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1332         size_t nrcount, r;
1333         wchar_t wc, wc2;
1334         int inci, inco;
1335
1336         assert(px != (_UC_iconv_t *)(-1));
1337         assert(sizeof(wchar_t) == 4);
1338
1339         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1340                 /* Note: For shift-state encodings we possibly need to output the
1341                  * shift sequence to return to initial state! */
1342                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1343                 }
1344                 px->tostate.__mask = px->fromstate.__mask = 0;
1345                 px->fromcodeset = px->fromcodeset0;
1346                 px->tobom = px->tobom0;
1347                 px->frombom = px->frombom0;
1348                 return 0;
1349         }
1350
1351         nrcount = 0;
1352         while (*inbytesleft) {
1353                 if (!*outbytesleft) {
1354                 TOO_BIG:
1355                         __set_errno(E2BIG);
1356                         return (size_t) -1;
1357                 }
1358
1359                 inci = inco = 1;
1360                 if (px->fromcodeset >= IC_MULTIBYTE) {
1361                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1362                         if (*inbytesleft < inci) goto INVALID;
1363                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1364                                 + ((unsigned char)((*inbuf)[1]));
1365                         if (inci == 4) {
1366                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1367                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1368                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1369                         } else {
1370                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1371                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1372                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1373                                         ) {                     /* surrogate */
1374                                         wc =- 0xd800U;
1375                                         if (*inbytesleft < 4) goto INVALID;
1376                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1377                                                 + ((unsigned char)((*inbuf)[3]));
1378                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1379                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1380                                                 goto ILLEGAL;
1381                                         }
1382                                         inci = 4;       /* Change inci here in case skipping illegals. */
1383                                         wc = 0x10000UL + (wc << 10) + wc2;
1384                                 }
1385                         }
1386
1387                         if (px->frombom) {
1388                                 px->frombom = 0;
1389                                 if ((wc == 0xfeffU)
1390                                         || (wc == ((inci == 4)
1391                                                            ? (((wchar_t) 0xfffe0000UL))
1392                                                            : ((wchar_t)(0xfffeUL))))
1393                                         ) {
1394                                         if (wc != 0xfeffU) {
1395                                                 px->fromcodeset ^= 1; /* toggle endianness */
1396                                                 wc = 0xfeffU;
1397                                         }
1398                                         if (!px->frombom) {
1399                                                 goto BOM_SKIP_OUTPUT;
1400                                         }
1401                                         goto GOT_BOM;
1402                                 }
1403                         }
1404
1405                         if (px->fromcodeset != IC_WCHAR_T) {
1406                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1407                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1408 #ifdef KUHN
1409                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1410                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1411 #endif
1412                                         ) {
1413                                         goto ILLEGAL;
1414                                 }
1415                         }
1416                 } else if (px->fromcodeset == IC_UTF_8) {
1417                         const char *p = *inbuf;
1418                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1419                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1420                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1421                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1422                                         if (r == (size_t)(-2)) {
1423                                         INVALID:
1424                                                 __set_errno(EINVAL);
1425                                         } else {
1426                                                 px->fromstate.__mask = 0;
1427                                                 inci = 1;
1428                                         ILLEGAL:
1429                                                 if (px->skip_invalid_input) {
1430                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1431                                                         goto BOM_SKIP_OUTPUT;
1432                                                 }
1433                                                 __set_errno(EILSEQ);
1434                                         }
1435                                         return (size_t)(-1);
1436                                 }
1437 #ifdef __UCLIBC_MJN3_ONLY__
1438 #warning TODO: optimize this.
1439 #endif
1440                                 if (p != NULL) { /* incomplete char case */
1441                                         goto INVALID;
1442                                 }
1443                                 p = *inbuf + 1; /* nul */
1444                         }
1445                         inci = p - *inbuf;
1446                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1447                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1448                                 goto ILLEGAL;
1449                         } else {                        /* some other 8-bit ascii-extension codeset */
1450                                 const __codeset_8_bit_t *c8b
1451                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1452                                 wc -= 0x80;
1453                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
1454                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1455                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1456                                 if (!wc) {
1457                                         goto ILLEGAL;
1458                                 }
1459                         }
1460                 }
1461
1462
1463                 if (px->tobom) {
1464                         inci = 0;
1465                         wc = 0xfeffU;
1466         GOT_BOM:
1467                         px->tobom = 0;
1468                 }
1469
1470                 if (px->tocodeset >= IC_MULTIBYTE) {
1471                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1472                         if (*outbytesleft < inco) goto TOO_BIG;
1473                         if (px->tocodeset != IC_WCHAR_T) {
1474                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1475                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1476 #ifdef KUHN
1477                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1478                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1479 #endif
1480                                         ) {
1481                                 REPLACE_32:
1482                                         wc = 0xfffd;
1483                                         ++nrcount;
1484                                 }
1485                         }
1486                         if (inco == 4) {
1487                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1488                         } else {
1489                                 if (((__uwchar_t)wc ) > 0xffffU) {
1490                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1491                                                 goto REPLACE_32;
1492                                         }
1493                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1494                                         wc2 = 0xdc00U + (wc & 0x3ff);
1495                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1496                                         if (px->tocodeset & 1) {
1497                                                 wc = bswap_16(wc);
1498                                                 wc2 = bswap_16(wc2);
1499                                         }
1500                                         wc += (wc2 << 16);
1501                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1502                         }
1503                         (*outbuf)[0] = (char)((unsigned char)(wc));
1504                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1505                         if (inco == 4) {
1506                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1507                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1508                         }
1509                 } else if (px->tocodeset == IC_UTF_8) {
1510                         const wchar_t *pw = &wc;
1511                         do {
1512                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1513                                 if (r != (size_t)(-1)) {
1514 #ifdef __UCLIBC_MJN3_ONLY__
1515 #warning TODO: What happens for a nul?
1516 #endif
1517                                         if (r == 0) {
1518                                                 if (wc != 0) {
1519                                                         goto TOO_BIG;
1520                                                 }
1521                                                 ++r;
1522                                         }
1523                                         break;
1524                                 }
1525                                 wc = 0xfffdU;
1526                                 ++nrcount;
1527                         } while (1);
1528                         inco = r;
1529                 } else if (((__uwchar_t)(wc)) < 0x80) {
1530                 CHAR_GOOD:
1531                                 **outbuf = wc;
1532                 } else {
1533                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1534                                 const __codeset_8_bit_t *c8b
1535                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1536                                 __uwchar_t u;
1537                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1538                                 u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1539                                                  + ((wc >> Cwc2c_TT_SHIFT)
1540                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1541                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
1542                                                  + (u << Cwc2c_TT_SHIFT)
1543                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1544                                 if (wc) {
1545                                         goto CHAR_GOOD;
1546                                 }
1547                         }
1548                         **outbuf = '?';
1549                         ++nrcount;
1550                 }
1551
1552                 *outbuf += inco;
1553                 *outbytesleft -= inco;
1554         BOM_SKIP_OUTPUT:
1555                 *inbuf += inci;
1556                 *inbytesleft -= inci;
1557         }
1558         return nrcount;
1559 }
1560
1561 #endif
1562 /**********************************************************************/
1563 #ifdef L_iconv_main
1564
1565 #include <stdio.h>
1566 #include <stdlib.h>
1567 #include <string.h>
1568 #include <wchar.h>
1569 #include <iconv.h>
1570 #include <stdarg.h>
1571 #include <libgen.h>
1572
1573 extern const unsigned char __iconv_codesets[]; /* table walked by main()'s -l listing: it advances by each entry's first byte, prints the string at offset 2, and stops at a zero byte */
1574
1575 #define IBUF BUFSIZ /* size of main()'s input buffer (ibuf) */
1576 #define OBUF BUFSIZ /* size of main()'s output buffer (obuf) */
1577
1578 char *progname; /* set from argv[0] in main(); prefixes error_msg() output and names the tool in the usage text */
1579 int hide_errors; /* set to 1 by main() for -s; when nonzero error_msg() prints nothing but still exits */
1580
1581 static void error_msg(const char *fmt, ...)
1582          __attribute__ ((noreturn, format (printf, 1, 2)));
1583
1584 static void error_msg(const char *fmt, ...)
1585 {
1586         va_list arg;
1587
1588         if (!hide_errors) {
1589                 fprintf(stderr, "%s: ", progname);
1590                 va_start(arg, fmt);
1591                 vfprintf(stderr, fmt, arg);
1592                 va_end(arg);
1593         }
1594
1595         exit(EXIT_FAILURE);
1596 }
1597
1598 int main(int argc, char **argv)
1599 {
1600         FILE *ifile;
1601         FILE *ofile = stdout;
1602         const char *p;
1603         const char *s;
1604         static const char opt_chars[] = "tfocsl";
1605                                       /* 012345 */
1606         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1607         iconv_t ic;
1608         char ibuf[IBUF];
1609         char obuf[OBUF];
1610         char *pi;
1611         char *po;
1612         size_t ni, no, r, pos;
1613
1614         hide_errors = 0;
1615
1616         for (s = opt_chars ; *s ; s++) {
1617                 opts[ s - opt_chars ] = NULL;
1618         }
1619
1620         progname = *argv;
1621         while (--argc) {
1622                 p = *++argv;
1623                 if ((*p != '-') || (*++p == 0)) {
1624                         break;
1625                 }
1626                 do {
1627                         if ((s = strchr(opt_chars,*p)) == NULL) {
1628                         USAGE:
1629                                 s = basename(progname);
1630                                 fprintf(stderr,
1631                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1632                                                 "  or\n%s -l\n", s, s);
1633                                 return EXIT_FAILURE;
1634                         }
1635                         if ((s - opt_chars) < 3) {
1636                                 if ((--argc == 0) || opts[s - opt_chars]) {
1637                                         goto USAGE;
1638                                 }
1639                                 opts[s - opt_chars] = *++argv;
1640                         } else {
1641                                 opts[s - opt_chars] = p;
1642                         }
1643                 } while (*++p);
1644         }
1645
1646         if (opts[5]) {                          /* -l */
1647                 fprintf(stderr, "Recognized codesets:\n");
1648                 for (s = __iconv_codesets ; *s ; s += *s) {
1649                         fprintf(stderr,"  %s\n", s+2);
1650                 }
1651                 s = __LOCALE_DATA_CODESET_LIST;
1652                 do {
1653                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1654                 } while (*++s);
1655
1656                 return EXIT_SUCCESS;
1657         }
1658
1659         if (opts[4]) {
1660                 hide_errors = 1;
1661         }
1662
1663         if (!opts[0] || !opts[1]) {
1664                 goto USAGE;
1665         }
1666         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1667                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1668         }
1669         if (opts[3]) {                          /* -c */
1670                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1671         }
1672
1673         if ((s = opts[2]) != NULL) {
1674                 if (!(ofile = fopen(s, "w"))) {
1675                         error_msg( "couldn't open %s for writing\n", s);
1676                 }
1677         }
1678
1679         pos = ni = 0;
1680         do {
1681                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1682                         ifile = stdin;          /* we don't check for duplicates */
1683                 } else if (!(ifile = fopen(*argv, "r"))) {
1684                         error_msg( "couldn't open %s for reading\n", *argv);
1685                 }
1686
1687                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1688                         pos += r;
1689                         ni += r;
1690                         no = OBUF;
1691                         pi = ibuf;
1692                         po = obuf;
1693                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1694                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1695                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1696                                 }
1697                         }
1698                         if ((r = OBUF - no) > 0) {
1699                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1700                                         error_msg( "write error\n");
1701                                 }
1702                         }
1703                         if (ni) {                       /* still bytes in buffer! */
1704                                 memmove(ibuf, pi, ni);
1705                         }
1706                 }
1707
1708                 if (ferror(ifile)) {
1709                         error_msg( "read error\n");
1710                 }
1711
1712                 ++argv;
1713
1714                 if (ifile != stdin) {
1715                         fclose(ifile);
1716                 }
1717
1718         } while (--argc > 0);
1719
1720         iconv_close(ic);
1721
1722         if (ni) {
1723                 error_msg( "incomplete sequence\n");
1724         }
1725
1726         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1727                 ? EXIT_SUCCESS : EXIT_FAILURE;
1728 }
1729
1730 #endif
1731 /**********************************************************************/