libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the standard's requirement that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Manuel
  90  */
  91
  92 #define _GNU_SOURCE
  93 #define _ISOC99_SOURCE
  94 #include <errno.h>
  95 #include <stddef.h>
  96 #include <limits.h>
  97 #include <stdint.h>
  98 #include <inttypes.h>
  99 #include <stdlib.h>
 100 #include <stdio.h>
 101 #include <assert.h>
 102 #include <locale.h>
 103 #include <wchar.h>
 104
 105 #ifdef __UCLIBC_HAS_LOCALE__
 106 #define ENCODING (__global_locale.encoding)
 107 #ifndef __CTYPE_HAS_UTF_8_LOCALES
 108 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
 109 #endif
 110 #else
 111 #define ENCODING (__ctype_encoding_7_bit)
 112 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 113 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
 114 #endif
 115 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 116 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
 117 #endif
 118 #undef L__wchar_utf8sntowcs
 119 #undef L__wchar_wcsntoutf8s
 120 #endif
 121
 122 #if WCHAR_MAX > 0xffffUL
 123 #define UTF_8_MAX_LEN 6
 124 #else
 125 #define UTF_8_MAX_LEN 3
 126 #endif
 127
 128 #define KUHN 1
 129
 130 /* Implementation-specific work functions. */
 131
 132 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
 133                                                                  const char **__restrict src, size_t n,
 134                                                                  mbstate_t *ps, int allow_continuation);
 135
 136 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 137                                                                  const wchar_t **__restrict src, size_t wn);
 138
 139 /* glibc extensions. */
 140
 141 extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
 142                                                    const char **__restrict src,
 143                                                    size_t NMC, size_t len, mbstate_t *__restrict ps);
 144
 145 extern size_t __wcsnrtombs(char *__restrict dst,
 146                                                    const wchar_t **__restrict src,
 147                                                    size_t NWC, size_t len, mbstate_t *__restrict ps);
 148
 149 /**********************************************************************/
 150 #ifdef L_btowc
 151
 152 wint_t btowc(int c)
 153 {
 154 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 155
 156         wchar_t wc;
 157         unsigned char buf[1];
 158         mbstate_t mbstate;
 159
 160         if (c != EOF) {
 161                 *buf = (unsigned char) c;
 162                 mbstate.mask = 0;               /* Initialize the mbstate. */
 163                 if (mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
 164                         return wc;
 165                 }
 166         }
 167         return WEOF;
 168
 169 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 170
 171 #ifdef __UCLIBC_HAS_LOCALE__
 172         assert((ENCODING == __ctype_encoding_7_bit)
 173                    || (ENCODING == __ctype_encoding_utf8));
 174 #endif /* __UCLIBC_HAS_LOCALE__ */
 175
 176         /* If we don't have 8-bit locale support, then this is trivial since
 177          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 178         return (((unsigned int)c) < 0x80) ? c : WEOF;
 179
 180 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 181 }
 182
 183 #endif
 184 /**********************************************************************/
 185 #ifdef L_wctob
 186
 187 /* Note: We completely ignore ps in all currently supported conversions. */
 188
 189 int wctob(wint_t c)
 190 {
 191 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 192
 193         unsigned char buf[MB_LEN_MAX];
 194
 195         return (wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
 196
 197 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 198
 199 #ifdef __UCLIBC_HAS_LOCALE__
 200         assert((ENCODING == __ctype_encoding_7_bit)
 201                    || (ENCODING == __ctype_encoding_utf8));
 202 #endif /* __UCLIBC_HAS_LOCALE__ */
 203
 204         /* If we don't have 8-bit locale support, then this is trivial since
 205          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 206
 207         /* TODO: need unsigned version of wint_t... */
 208 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 209         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 210
 211 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 212 }
 213
 214 #endif
 215 /**********************************************************************/
 216 #ifdef L_mbsinit
 217
 218 int mbsinit(const mbstate_t *ps)
 219 {
 220         return !ps || !ps->mask;
 221 }
 222
 223 #endif
 224 /**********************************************************************/
 225 #ifdef L_mbrlen
 226
 227 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 228          __attribute__ ((__weak__, __alias__("__mbrlen")));
 229
 230 size_t __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 231 {
 232         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 233
 234         return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 235 }
 236
 237 #endif
 238 /**********************************************************************/
 239 #ifdef L_mbrtowc
 240
 241 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 242                            size_t n, mbstate_t *__restrict ps)
 243 {
 244         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 245         wchar_t wcbuf[1];
 246         const char *p;
 247         size_t r;
 248         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 249
 250         if (!ps) {
 251                 ps = &mbstate;
 252         }
 253
 254         if (!s) {
 255                 pwc = (wchar_t *) s;    /* NULL */
 256                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 257                 s = empty_string;
 258                 n = 1;
 259         } else if (!n) {
 260                 return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */
 261                         ? ((size_t) -1) : ((size_t) -2);
 262         }
 263
 264         p = s;
 265
 266 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 267         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 268         if (ENCODING == __ctype_encoding_utf8) {
 269                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 270                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 271         }
 272 #endif
 273
 274         r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 275
 276         if (((ssize_t) r) >= 0) {
 277                 if (pwc) {
 278                         *pwc = *wcbuf;
 279                 }
 280         }
 281         return (size_t) r;
 282 }
 283
 284 #endif
 285 /**********************************************************************/
 286 #ifdef L_wcrtomb
 287
 288 /* Note: We completely ignore ps in all currently supported conversions. */
 289 /* TODO: Check for valid state anyway? */
 290
 291 size_t wcrtomb(register char *__restrict s, wchar_t wc,
 292                            mbstate_t *__restrict ps)
 293 {
 294         wchar_t wcbuf[2];
 295         const wchar_t *pwc;
 296         size_t r;
 297         char buf[MB_LEN_MAX];
 298
 299         if (!s) {
 300                 s = buf;
 301                 wc = 0;
 302         }
 303
 304         pwc = wcbuf;
 305         wcbuf[0] = wc;
 306         wcbuf[1] = 0;
 307
 308         r = __wcsnrtombs(s, &pwc, SIZE_MAX, MB_LEN_MAX, ps);
 309         return (r != 0) ? r : 1;
 310 }
 311
 312 #endif
 313 /**********************************************************************/
 314 #ifdef L_mbsrtowcs
 315
 316 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 317                                  size_t len, mbstate_t *__restrict ps)
 318 {
 319         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 320
 321         return __mbsnrtowcs(dst, src, SIZE_MAX, len,
 322                                                 ((ps != NULL) ? ps : &mbstate));
 323 }
 324
 325 #endif
 326 /**********************************************************************/
 327 #ifdef L_wcsrtombs
 328
 329 /* Note: We completely ignore ps in all currently supported conversions.
 330
 331  * TODO: Check for valid state anyway? */
 332
 333 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 334                                  size_t len, mbstate_t *__restrict ps)
 335 {
 336         return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 337 }
 338
 339 #endif
 340 /**********************************************************************/
 341 #ifdef L__wchar_utf8sntowcs
 342
 343 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  344  * UTF-8-test.txt stress test.
 345  */
 346 /*  #define DECODER */
 347
 348 #ifdef DECODER
 349 #ifndef KUHN
 350 #define KUHN
 351 #endif
 352 #endif
 353
 354 size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
 355                                                   const char **__restrict src, size_t n,
 356                                                   mbstate_t *ps, int allow_continuation)
 357 {
 358         register const char *s;
 359         __uwchar_t mask;
 360         __uwchar_t wc;
 361         wchar_t wcbuf[1];
 362         size_t count;
 363         int incr;
 364
 365         s = *src;
 366
 367         assert(s != NULL);
 368         assert(ps != NULL);
 369
 370         incr = 1;
 371         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 372          * wprintf, we need to be able to compute the number of wchars needed
 373          * for the mbs conversion, not to exceed the precision specified.
 374          * But if dst is NULL, the return value is the length assuming a
 375          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
 376          * as pwc in order to flag that we really want the length, subject
 377          * to the restricted buffer size and no partial conversions.
 378          * See mbsnrtowcs() as well. */
 379         if (!pwc || (pwc == ((wchar_t *)ps))) {
 380                 if (!pwc) {
 381                         wn = SIZE_MAX;
 382                 }
 383                 pwc = wcbuf;
 384                 incr = 0;
 385         }
 386
 387         /* This is really here only to support the glibc extension function
 388          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
 389          * check on the validity of the mbstate. */
 390         if (!(count = wn)) {
 391                 return 0;
 392         }
 393
 394         if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */
 395 #ifdef DECODER
 396                 wc = (__uwchar_t) ps->wc;
 397                 if (n) {
 398                         goto CONTINUE;
 399                 }
 400                 goto DONE;
 401 #else
 402                 if ((wc = (__uwchar_t) ps->wc) != 0xffffU) {
 403                         /* TODO: change error code here and below? */
 404                         if (n) {
 405                                 goto CONTINUE;
 406                         }
 407                         goto DONE;
 408                 }
 409                 return (size_t) -1;             /* We're in an error state. */
 410 #endif
 411         }
 412
 413         do {
 414                 if (!n) {
 415                         goto DONE;
 416                 }
 417                 --n;
 418                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
 419                         mask = 0x40;
 420 #ifdef __UCLIBC_MJN3_ONLY__
 421 #warning fix range for 16 bit wides
 422 #endif
 423                         if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
 424                                 goto START;
 425                         }
 426                 BAD:
 427 #ifdef DECODER
 428                         wc = 0xfffdU;
 429                         goto COMPLETE;
 430 #else
 431                         ps->mask = mask;
 432                         ps->wc = 0xffffU;
 433                         return (size_t) -1;     /* Illegal start byte! */
 434 #endif
 435
 436                 CONTINUE:
 437                         while (n) {
 438                                 --n;
 439                                 if ((*s & 0xc0) != 0x80) {
 440                                         goto BAD;
 441                                 }
 442                                 mask <<= 5;
 443                                 wc <<= 6;
 444                                 wc += (*s & 0x3f);      /* keep seperate for bcc (smaller code) */
 445                                 ++s;
 446                         START:
 447                                 wc &= ~(mask << 1);
 448
 449                                 if ((wc & mask) == 0) { /* Character completed. */
 450                                         if ((mask >>= 5) == 0x40) {
 451                                                 mask += mask;
 452                                         }
 453                                         /* Check for invalid sequences (longer than necessary)
 454                                          * and invalid chars.  */
 455                                         if ( (wc < mask) /* Sequence not minimal length. */
 456 #ifdef KUHN
 457 #if UTF_8_MAX_LEN == 3
 458 #error broken since mask can overflow!!
 459                                                  /* For plane 0, these are the only defined values.*/
 460                                                  || (wc > 0xfffdU)
 461 #else
 462                                                  /* Note that we don't need to worry about exceeding */
 463                                                  /* 31 bits as that is the most that UTF-8 provides. */
 464                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 465 #endif
 466                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 467 #endif /* KUHN */
 468                                                  ) {
 469                                                 goto BAD;
 470                                         }
 471                                         goto COMPLETE;
 472                                 }
 473                         }
 474                         /* Character potentially valid but incomplete. */
 475                         if (!allow_continuation) {
 476                                 if (count != wn) {
 477                                         return 0;
 478                                 }
 479                                 /* NOTE: The following can fail if you allow and then disallow
 480                                  * continuation!!! */
 481 #if UTF_8_MAX_LEN == 3
 482 #error broken since mask can overflow!!
 483 #endif
 484                                 /* Need to back up... */
 485                                 do {
 486                                         --s;
 487                                 } while ((mask >>= 5) >= 0x40);
 488                                 goto DONE;
 489                         }
 490                         ps->mask = (wchar_t) mask;
 491                         ps->wc = (wchar_t) wc;
 492                         *src = s;
 493                         return (size_t) -2;
 494                 }
 495         COMPLETE:
 496                 *pwc = wc;
 497                 pwc += incr;
 498
 499         }
 500 #ifdef DECODER
 501         while (--count);
 502 #else
 503         while (wc && --count);
 504
 505         if (!wc) {
 506                 s = NULL;
 507         }
 508 #endif
 509
 510  DONE:
 511         /* ps->wc is irrelavent here. */
 512         ps->mask = 0;
 513         if (pwc != wcbuf) {
 514                 *src = s;
 515         }
 516
 517         return wn - count;
 518 }
 519
 520 #endif
 521 /**********************************************************************/
 522 #ifdef L__wchar_wcsntoutf8s
 523
 524 size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 525                                                   const wchar_t **__restrict src, size_t wn)
 526 {
 527         register char *p;
 528         size_t len, t;
 529         __uwchar_t wc;
 530         const __uwchar_t *swc;
 531         int store;
 532         char buf[MB_LEN_MAX];
 533         char m;
 534
 535         store = 1;
 536         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 537          * printf, we need to be able to compute the number of bytes needed
 538          * for the mbs conversion, not to exceed the precision specified.
 539          * But if dst is NULL, the return value is the length assuming a
 540          * sufficiently sized buffer.  So, we allow passing of (char *) src
 541          * as dst in order to flag that we really want the length, subject
 542          * to the restricted buffer size and no partial conversions.
 543          * See wcsnrtombs() as well. */
 544         if (!s || (s == ((char *) src))) {
 545                 if (!s) {
 546                         n = SIZE_MAX;
 547                 }
 548             s = buf;
 549                 store = 0;
 550         }
 551
 552         t = n;
 553         swc = (const __uwchar_t *) *src;
 554
 555         assert(swc != NULL);
 556
 557         while (wn && t) {
 558                 wc = *swc;
 559
 560                 *s = wc;
 561                 len = 1;
 562
 563                 if (wc >= 0x80) {
 564 #ifdef KUHN
 565                         if (
 566 #if UTF_8_MAX_LEN == 3
 567                                 /* For plane 0, these are the only defined values.*/
 568                                 /* Note that we don't need to worry about exceeding */
 569                                 /* 31 bits as that is the most that UTF-8 provides. */
 570                                 (wc > 0xfffdU)
 571 #else
 572                                 /* UTF_8_MAX_LEN == 6 */
 573                                 (wc > 0x7fffffffUL)
 574                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 575 #endif
 576                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 577                                 ) {
 578                                 return (size_t) -1;
 579                         }
 580 #else  /* KUHN */
 581 #if UTF_8_MAX_LEN != 3
 582                         if (wc > 0x7fffffffUL) { /* Value too large. */
 583                                 return (size_t) -1;
 584                         }
 585 #endif
 586 #endif /* KUHN */
 587
 588                         wc >>= 1;
 589                         p = s;
 590                         do {
 591                                 ++p;
 592                         } while (wc >>= 5);
 593                         wc = *swc;
 594                         if ((len = p - s) > t) { /* Not enough space. */
 595                                 break;
 596                         }
 597
 598                         m = 0x80;
 599                         while( p>s ) {
 600                                 m = (m >> 1) | 0x80;
 601                                 *--p = (wc & 0x3f) | 0x80;
 602                                 wc >>= 6;
 603                         }
 604                         *s |= (m << 1);
 605                 } else if (wc == 0) {   /* End of string. */
 606                         swc = NULL;
 607                         break;
 608                 }
 609
 610                 ++swc;
 611                 --wn;
 612                 t -= len;
 613                 if (store) {
 614                         s += len;
 615                 }
 616         }
 617
 618         if (store) {
 619                 *src = (const wchar_t *) swc;
 620         }
 621
 622         return n - t;
 623 }
 624
 625
 626 #endif
 627 /**********************************************************************/
 628 #ifdef L___mbsnrtowcs
 629
 630 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 631
 632 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 633                                   size_t NMC, size_t len, mbstate_t *__restrict ps)
 634          __attribute__ ((__weak__, __alias__("__mbsnrtowcs")));
 635
 636 size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 637                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
 638 {
 639         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 640         wchar_t wcbuf[1];
 641         const char *s;
 642         size_t count;
 643         int incr;
 644
 645         if (!ps) {
 646                 ps = &mbstate;
 647         }
 648
 649 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 650         if (ENCODING == __ctype_encoding_utf8) {
 651                 size_t r;
 652                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
 653                                 != (size_t) -2) ? r : 0;
 654         }
 655 #endif
 656         incr = 1;
 657         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 658          * wprintf, we need to be able to compute the number of wchars needed
 659          * for the mbs conversion, not to exceed the precision specified.
 660          * But if dst is NULL, the return value is the length assuming a
 661          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
 662          * as dst in order to flag that we really want the length, subject
 663          * to the restricted buffer size and no partial conversions.
 664          * See _wchar_utf8sntowcs() as well. */
 665         if (!dst || (dst == ((wchar_t *)ps))) {
 666                 if (!dst) {
 667                         len = SIZE_MAX;
 668                 }
 669                 dst = wcbuf;
 670                 incr = 0;
 671         }
 672
 673         /* Since all the following encodings are single-byte encodings... */
 674         if (len > NMC) {
 675                 len = NMC;
 676         }
 677
 678         count = len;
 679         s = *src;
 680
 681 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 682         if (ENCODING == __ctype_encoding_8_bit) {
 683                 wchar_t wc;
 684                 while (count) {
 685                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
 686                                 wc -= 0x80;
 687                                 wc = __global_locale.tbl8c2wc[
 688                                                   (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
 689                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
 690                                 if (!wc) {
 691                                         goto BAD;
 692                                 }
 693                         }
 694                         if (!(*dst = wc)) {
 695                                 s = NULL;
 696                                 break;
 697                         }
 698                         dst += incr;
 699                         ++s;
 700                         --count;
 701                 }
 702                 if (dst != wcbuf) {
 703                         *src = s;
 704                 }
 705                 return len - count;
 706         }
 707 #endif
 708
 709 #ifdef __UCLIBC_HAS_LOCALE__
 710         assert(ENCODING == __ctype_encoding_7_bit);
 711 #endif
 712
 713         while (count) {
 714                 if ((*dst = (unsigned char) *s) == 0) {
 715                         s = NULL;
 716                         break;
 717                 }
 718                 if (*dst >= 0x80) {
 719 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 720                 BAD:
 721 #endif
 722                         __set_errno(EILSEQ);
 723                         return (size_t) -1;
 724                 }
 725                 ++s;
 726                 dst += incr;
 727                 --count;
 728         }
 729         if (dst != wcbuf) {
 730                 *src = s;
 731         }
 732         return len - count;
 733 }
 734
 735 #endif
 736 /**********************************************************************/
 737 #ifdef L___wcsnrtombs
 738
 739 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 740
 741 /* Note: We completely ignore ps in all currently supported conversions.
 742  * TODO: Check for valid state anyway? */
 743
 744 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 745                                   size_t NWC, size_t len, mbstate_t *__restrict ps)
 746          __attribute__ ((__weak__, __alias__("__wcsnrtombs")));
 747
 748 size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 749                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 750 {
 751         const __uwchar_t *s;
 752         size_t count;
 753         int incr;
 754         char buf[MB_LEN_MAX];
 755
 756 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 757         if (ENCODING == __ctype_encoding_utf8) {
 758                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 759         }
 760 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 761
 762         incr = 1;
 763         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 764          * printf, we need to be able to compute the number of bytes needed
 765          * for the mbs conversion, not to exceed the precision specified.
 766          * But if dst is NULL, the return value is the length assuming a
 767          * sufficiently sized buffer.  So, we allow passing of (char *) src
 768          * as dst in order to flag that we really want the length, subject
 769          * to the restricted buffer size and no partial conversions.
 770          * See _wchar_wcsntoutf8s() as well. */
 771         if (!dst || (dst == ((char *) src))) {
 772                 if (!dst) {
 773                         len = SIZE_MAX;
 774                 }
 775                 dst = buf;
 776                 incr = 0;
 777         }
 778
 779         /* Since all the following encodings are single-byte encodings... */
 780         if (len > NWC) {
 781                 len = NWC;
 782         }
 783
 784         count = len;
 785         s = (const __uwchar_t *) *src;
 786
 787 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 788         if (ENCODING == __ctype_encoding_8_bit) {
 789                 __uwchar_t wc;
 790                 __uwchar_t u;
 791                 while (count) {
 792                         if ((wc = *s) <= 0x7f) {
 793                                 if (!(*dst = (unsigned char) wc)) {
 794                                         s = NULL;
 795                                         break;
 796                                 }
 797                         } else {
 798                                 u = 0;
 799                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 800                                         u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 801                                                                                                                 + Cwc2c_TT_SHIFT)];
 802                                         u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 803                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 804                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 805                                         u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
 806                                                                         + (u << Cwc2c_TT_SHIFT)
 807                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 808                                 }
 809
 810 #define __WCHAR_REPLACEMENT_CHAR '?'
 811 #ifdef __WCHAR_REPLACEMENT_CHAR
 812                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 813 #else  /* __WCHAR_REPLACEMENT_CHAR */
 814                                 if (!u) {
 815                                         goto BAD;
 816                                 }
 817                                 *dst = (unsigned char) u;
 818 #endif /* __WCHAR_REPLACEMENT_CHAR */
 819                         }
 820                         ++s;
 821                         dst += incr;
 822                         --count;
 823                 }
 824                 if (dst != buf) {
 825                         *src = (const wchar_t *) s;
 826                 }
 827                 return len - count;
 828         }
 829 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 830
 831 #ifdef __UCLIBC_HAS_LOCALE__
 832         assert(ENCODING == __ctype_encoding_7_bit);
 833 #endif
 834
 835         while (count) {
 836                 if (*s >= 0x80) {
 837 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
 838                 BAD:
 839 #endif
 840                         __set_errno(EILSEQ);
 841                         return (size_t) -1;
 842                 }
 843                 if ((*dst = (unsigned char) *s) == 0) {
 844                         s = NULL;
 845                         break;
 846                 }
 847                 ++s;
 848                 dst += incr;
 849                 --count;
 850         }
 851         if (dst != buf) {
 852                 *src = (const wchar_t *) s;
 853         }
 854         return len - count;
 855 }
 856
 857 #endif
 858 /**********************************************************************/
 859 #ifdef L_wcswidth
 860
 861 #ifdef __UCLIBC_MJN3_ONLY__
 862 #warning if we start doing translit, wcwidth and wcswidth will need updating.
 863 #endif
 864
 865 #if defined(__UCLIBC_HAS_LOCALE__) && \
 866 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 867
 /* new_idx[]: first-level index for the wcswidth() width lookup.
  *
  * Indexed by the high byte (wc >> 8) of a 16-bit wide character.
  * wcswidth() reads new_idx[h] and new_idx[h+1] and uses them as the
  * initial low and high bounds of a binary search in new_tbl[].  Because
  * entry h+1 is read for every h in 0..255, the table holds 257 entries. */
 868 static const unsigned char new_idx[] = {
 869         0,    5,    5,    6,   10,   15,   28,   39,
 870         48,   48,   71,   94,  113,  128,  139,  154,
 871         175,  186,  188,  188,  188,  188,  188,  188,
 872         203,  208,  208,  208,  208,  208,  208,  208,
 873         208,  219,  219,  219,  222,  222,  222,  222,
 874         222,  222,  222,  222,  222,  222,  222,  224,
 875         224,  231,  231,  231,  231,  231,  231,  231,
 876         231,  231,  231,  231,  231,  231,  231,  231,
 877         231,  231,  231,  231,  231,  231,  231,  231,
 878         231,  231,  231,  231,  231,  231,  231,  231,
 879         231,  231,  231,  231,  231,  231,  231,  231,
 880         231,  231,  231,  231,  231,  231,  231,  231,
 881         231,  231,  231,  231,  231,  231,  231,  231,
 882         231,  231,  231,  231,  231,  231,  231,  231,
 883         231,  231,  231,  231,  231,  231,  231,  231,
 884         231,  231,  231,  231,  231,  231,  231,  231,
 885         231,  231,  231,  231,  231,  231,  231,  231,
 886         231,  231,  231,  231,  231,  231,  231,  231,
 887         231,  231,  231,  231,  231,  231,  231,  231,
 888         231,  231,  231,  231,  231,  231,  231,  231,
 889         231,  231,  231,  231,  231,  233,  233,  233,
 890         233,  233,  233,  233,  234,  234,  234,  234,
 891         234,  234,  234,  234,  234,  234,  234,  234,
 892         234,  234,  234,  234,  234,  234,  234,  234,
 893         234,  234,  234,  234,  234,  234,  234,  234,
 894         234,  234,  234,  234,  234,  234,  234,  234,
 895         234,  234,  234,  234,  234,  234,  234,  234,
 896         236,  236,  236,  236,  236,  236,  236,  236,
 897         236,  236,  236,  236,  236,  236,  236,  236,
 898         236,  236,  236,  236,  236,  236,  236,  236,
 899         236,  236,  236,  236,  236,  236,  236,  236,
 900         236,  237,  237,  238,  241,  241,  242,  249,
 901         255,
 902 };
 903
 /* new_tbl[]: second-level table for the wcswidth() width lookup.
  *
  * Each entry is a low-byte (wc & 0xff) boundary.  For a given high byte
  * h, wcswidth() binary-searches the slice bounded by new_idx[h] and
  * new_idx[h+1], moving the low bound up whenever the character's low
  * byte is >= new_tbl[m].  The index the search settles on is then used
  * to pick the width from new_wtbl[].  The binary search relies on the
  * boundaries being in ascending order within each slice. */
 904 static const unsigned char new_tbl[] = {
 905         0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
 906         0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
 907         0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
 908         0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
 909         0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
 910         0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
 911         0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
 912         0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
 913         0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
 914         0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
 915         0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
 916         0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
 917         0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
 918         0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
 919         0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
 920         0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
 921         0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
 922         0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
 923         0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
 924         0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
 925         0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
 926         0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
 927         0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
 928         0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
 929         0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
 930         0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
 931         0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
 932         0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
 933         0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
 934         0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
 935         0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
 936         0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
 937 };
 938
 /* new_wtbl[]: widths for the wcswidth() lookup, parallel to new_tbl[].
  *
  * new_wtbl[i] is the value wcswidth() adds to its running column count
  * when the binary search over new_tbl[] settles on index i.  The values
  * present are -1, 0, 1 and 2; wcswidth() adds the selected entry without
  * checking its sign (its own comment notes that none of the entries it
  * reaches should be -1). */
 939 static const signed char new_wtbl[] = {
 940         0,   -1,    1,   -1,    1,    1,    0,    1,
 941         0,    1,    1,    0,    1,    0,    1,    1,
 942         0,    1,    0,    1,    0,    1,    0,    1,
 943         0,    1,    0,    1,    1,    0,    1,    0,
 944         1,    0,    1,    0,    1,    0,    1,    1,
 945         0,    1,    0,    1,    0,    1,    0,    1,
 946         1,    0,    1,    0,    1,    0,    1,    0,
 947         1,    0,    1,    0,    1,    0,    1,    0,
 948         1,    0,    1,    0,    1,    0,    1,    1,
 949         0,    1,    0,    1,    0,    1,    0,    1,
 950         0,    1,    0,    1,    0,    1,    0,    1,
 951         0,    1,    0,    1,    0,    1,    1,    0,
 952         1,    0,    1,    0,    1,    0,    1,    0,
 953         1,    0,    1,    0,    1,    0,    1,    0,
 954         1,    1,    0,    1,    0,    1,    0,    1,
 955         0,    1,    0,    1,    0,    1,    0,    1,
 956         1,    0,    1,    0,    1,    0,    1,    0,
 957         1,    0,    1,    1,    0,    1,    0,    1,
 958         0,    1,    0,    1,    0,    1,    0,    1,
 959         0,    1,    1,    0,    1,    0,    1,    0,
 960         1,    0,    1,    0,    1,    0,    1,    0,
 961         1,    0,    1,    0,    1,    0,    1,    1,
 962         0,    1,    0,    1,    0,    1,    0,    1,
 963         0,    1,    2,    0,    1,    0,    1,    0,
 964         1,    0,    1,    0,    1,    0,    1,    0,
 965         1,    0,    1,    1,    0,    1,    0,    1,
 966         1,    0,    1,    0,    1,    0,    1,    0,
 967         1,    0,    1,    1,    2,    1,    1,    2,
 968         2,    0,    2,    1,    2,    0,    2,    2,
 969         1,    1,    2,    1,    1,    2,    1,    0,
 970         1,    1,    0,    1,    0,    1,    2,    1,
 971         0,    2,    1,    2,    1,    0,    1,
 972 };
 973
 974 int wcswidth(const wchar_t *pwcs, size_t n)
 975 {
 976     int h, l, m, count;
 977     wchar_t wc;
 978     unsigned char b;
 979
 980         if (ENCODING == __ctype_encoding_7_bit) {
 981                 size_t i;
 982
 983                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
 984                         if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
 985                                 return -1;
 986                         }
 987                 }
 988         }
 989 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 990         else if (ENCODING == __ctype_encoding_8_bit) {
 991                 mbstate_t mbstate;
 992
 993                 mbstate.mask = 0;                       /* Initialize the mbstate. */
 994                 if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
 995                         return -1;
 996                 }
 997         }
 998 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 999 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1000         /* For stricter handling of allowed unicode values... see comments above. */
1001         else if (ENCODING == __ctype_encoding_utf8) {
1002                 size_t i;
1003
1004                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1005                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1006                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1007                                 ) {
1008                                 return -1;
1009                         }
1010                 }
1011         }
1012 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1013
1014     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1015                 if (wc <= 0xff) {
1016                         /* If we're here, wc != 0. */
1017                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1018                                 return -1;
1019                         }
1020                         ++count;
1021                         continue;
1022                 }
1023                 if (((unsigned int) wc) <= 0xffff) {
1024                         b = wc & 0xff;
1025                         h = (wc >> 8);
1026                         l = new_idx[h];
1027                         h = new_idx[h+1];
1028                         while ((m = (l+h) >> 1) != l) {
1029                                 if (b >= new_tbl[m]) {
1030                                         l = m;
1031                                 } else {                /* wc < tbl[m] */
1032                                         h = m;
1033                                 }
1034                         }
1035                         count += new_wtbl[l]; /* none should be -1. */
1036                         continue;
1037                 }
1038
1039                 /* Redo this to minimize average number of compares?*/
1040                 if (wc >= 0x1d167) {
1041                         if (wc <= 0x1d1ad) {
1042                                 if ((wc <= 0x1d169
1043                                          || (wc >= 0x1d173
1044                                                  && (wc <= 0x1d182
1045                                                          || (wc >= 0x1d185
1046                                                                  && (wc <= 0x1d18b
1047                                                                          || (wc >= 0x1d1aa))))))
1048                                         ) {
1049                                         continue;
1050                                 }
1051                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1052                                 continue;
1053                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1054                                 ++count;                /* need 2.. add one here */
1055                         }
1056 #if (WCHAR_MAX > 0x7fffffffL)
1057                         else if (wc > 0x7fffffffL) {
1058                                 return -1;
1059                         }
1060 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1061                 }
1062
1063                 ++count;
1064     }
1065
1066     return count;
1067 }
1068
1069 #else  /*  __UCLIBC_HAS_LOCALE__ */
1070
/* wcswidth() for builds without locale support.
 *
 * Examines at most n wide characters of pwcs, stopping at the first nul.
 * Each character in 0x20..0x7e or 0xa0..0xff counts as one column; any
 * other character makes the whole call return -1.
 */
int wcswidth(const wchar_t *pwcs, size_t n)
{
	int width = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		wchar_t ch = pwcs[i];

		if (ch == 0) {
			break;			/* end of string */
		}
		if (ch > 0xff) {
			return -1;		/* outside the single-byte range */
		}
		if (ch < 32) {
			return -1;		/* C0 control (or a negative value) */
		}
		if ((ch >= 0x7f) && (ch < 0xa0)) {
			return -1;		/* DEL and the 0x80..0x9f controls */
		}
		++width;
	}

	return width;
}
1091
1092 #endif /*  __UCLIBC_HAS_LOCALE__ */
1093
1094 #endif
1095 /**********************************************************************/
1096 #ifdef L_wcwidth
1097
/* wcwidth() -- column width of a single wide character.
 *
 * Delegates to wcswidth() on a one-character string, so the result is
 * whatever wcswidth() reports for that character (0 for a nul, since
 * wcswidth() stops at it). */
int wcwidth(wchar_t wc)
{
	const wchar_t *single = &wc;

	return wcswidth(single, 1);
}
1102
1103 #endif
1104 /**********************************************************************/
1105
1106
/* Conversion descriptor state behind an iconv_t.
 *
 * Allocated and filled in by iconv_open(), released by iconv_close().
 * The "...0" members hold the values chosen at open time; iconv() copies
 * them back when it is asked to reset the conversion state. */
1107 typedef struct {
1108         mbstate_t tostate;              /* output shift state; its mask is zeroed on open and reset */
1109         mbstate_t fromstate;            /* input shift state; passed to the UTF-8 decoder */
1110         int tocodeset;                  /* target codeset id as returned by find_codeset() */
1111         int fromcodeset;                /* source codeset id; bit 0 is toggled when a swapped BOM is read */
1112         int frombom;                    /* nonzero: check the first input character for a BOM */
1113         int tobom;                      /* nonzero: a BOM (0xfeff) is still to be written */
1114         int fromcodeset0;               /* fromcodeset as chosen at open time */
1115         int frombom0;                   /* frombom as chosen at open time */
1116         int tobom0;                     /* tobom as chosen at open time */
1117         int skip_invalid_input;         /* To support iconv -c option. */
1118 } _UC_iconv_t;
1119
1120
1121
1122 #ifdef L_iconv
1123
1124 #include <iconv.h>
1125 #include <string.h>
1126 #include <endian.h>
1127 #include <byteswap.h>
1128
1129 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1130 #error unsupported endianness for iconv
1131 #endif
1132
1133 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1134 #error currently iconv requires 8 bit locales
1135 #endif
1136 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1137 #error currently iconv requires UTF-8 locales
1138 #endif
1139
1140
/* Codeset identifiers used by iconv().
 *
 * Ids of IC_MULTIBYTE and above denote the fixed-width 2- or 4-byte
 * encodings; iconv() derives the unit size from bits 1 and 2 of the id
 * (except IC_WCHAR_T, which is always handled as 4 bytes).  The small
 * ids are the byte-oriented codesets: 1 is 7-bit ASCII, 2 is UTF-8, and
 * find_codeset() hands out ids from 3 upwards for the 8-bit codesets. */
1141 enum {
1142         IC_WCHAR_T = 0xe0,
1143         IC_MULTIBYTE = 0xe0,            /* threshold: ids >= this are multibyte */
1144 #if __BYTE_ORDER == __BIG_ENDIAN
1145         IC_UCS_4 =      0xec,
1146         IC_UTF_32 = 0xe4,
1147         IC_UCS_2 =      0xe2,
1148         IC_UTF_16 = 0xea,
1149 #else
1150         IC_UCS_4 =      0xed,           /* NOTE(review): the little-endian values include bit 0, */
1151         IC_UTF_32 = 0xe5,               /* so mask tests such as (id & IC_UTF_16) == IC_UTF_16 */
1152         IC_UCS_2 =      0xe3,           /* only match ids that have bit 0 set -- confirm this */
1153         IC_UTF_16 = 0xeb,               /* is intended. */
1154 #endif
1155         IC_UTF_8 = 2,
1156         IC_ASCII = 1
1157 };
1158
1159 /* For the multibyte
1160  * bit 0 means swap endian
1161  * bit 1 means 2 byte
1162  * bit 2 means 4 byte
1163  *
1164  */
1165
/* Table of the codeset names iconv_open() understands natively.
 *
 * The table is a packed sequence of entries, each laid out as
 *
 *     byte 0      total size of the entry in bytes (the stride used by
 *                 find_codeset() to step to the next entry)
 *     byte 1      codeset id returned for this name
 *     byte 2...   the name, nul-terminated
 *
 * so byte 0 must equal strlen(name) + 3.  The last entry relies on the
 * string literal's own terminating nul, both as the name's terminator and
 * as the zero size byte that ends the table; its size byte is therefore
 * strlen(name) + 2.
 *
 * In the id, bit 0 selects the byte order, bit 1 marks 2-byte units,
 * bit 2 marks 4-byte units, and bit 4 (0x10) is the flag iconv_open()
 * reads to decide whether a BOM is expected/emitted.
 *
 * NOTE(review): the little-endian table sets the 0x10 BOM flag for the
 * plain "UTF-32"/"UTF-16" names while the big-endian table does not,
 * although both are commented "with BOM" -- confirm which is intended.
 */
const unsigned char codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	/* The id must be written as a hex escape: a stray "\f" here would add
	 * bytes to the entry and break the size/stride invariant above. */
	"\x09\xe4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
1198
1199 static int find_codeset(const char *name)
1200 {
1201         const unsigned char *s;
1202         int codeset;
1203
1204         for (s = codesets ; *s ; s += *s) {
1205                 if (!strcasecmp(s+2, name)) {
1206                         return s[1];
1207                 }
1208         }
1209
1210         /* The following is ripped from find_locale in locale.c. */
1211
1212         /* TODO: maybe CODESET_LIST + *s ??? */
1213         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1214         codeset = 2;
1215         s = CODESET_LIST;
1216         do {
1217                 ++codeset;              /* Increment codeset first. */
1218                 if (!strcasecmp(CODESET_LIST+*s, name)) {
1219                         return codeset;
1220                 }
1221         } while (*++s);
1222
1223         return 0;                       /* No matching codeset! */
1224 }
1225
1226 iconv_t iconv_open(const char *tocode, const char *fromcode)
1227 {
1228         register _UC_iconv_t *px;
1229         int tocodeset, fromcodeset;
1230
1231         if (((tocodeset = find_codeset(tocode)) != 0)
1232                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1233                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1234                         px->tocodeset = tocodeset;
1235                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1236                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1237                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1238                         px->skip_invalid_input = px->tostate.mask = px->fromstate.mask = 0;
1239                         return (iconv_t) px;
1240                 }
1241         } else {
1242                 __set_errno(EINVAL);
1243         }
1244         return (iconv_t)(-1);
1245 }
1246
/* Release a conversion descriptor obtained from iconv_open().
 *
 * The descriptor is a single heap allocation, so it is simply handed to
 * free().  Always returns 0.
 */
int iconv_close(iconv_t cd)
{
	void *state = cd;

	free(state);
	return 0;
}
1253
1254 size_t iconv(iconv_t cd, char **__restrict inbuf,
1255                          size_t *__restrict inbytesleft,
1256                      char **__restrict outbuf, size_t *__restrict outbytesleft)
1257 {
1258         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1259         size_t nrcount, r;
1260         wchar_t wc, wc2;
1261         int inci, inco;
1262
1263         assert(px != (_UC_iconv_t *)(-1));
1264         assert(sizeof(wchar_t) == 4);
1265
1266         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1267                 /* Note: For shift-state encodings we possibly need to output the
1268                  * shift sequence to return to initial state! */
1269                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1270                 }
1271                 px->tostate.mask = px->fromstate.mask = 0;
1272                 px->fromcodeset = px->fromcodeset0;
1273                 px->tobom = px->tobom0;
1274                 px->frombom = px->frombom0;
1275                 return 0;
1276         }
1277
1278         nrcount = 0;
1279         while (*inbytesleft) {
1280                 if (!*outbytesleft) {
1281                 TOO_BIG:
1282                         __set_errno(E2BIG);
1283                         return (size_t) -1;
1284                 }
1285
1286                 inci = inco = 1;
1287                 if (px->fromcodeset >= IC_MULTIBYTE) {
1288                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1289                         if (*inbytesleft < inci) goto INVALID;
1290                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1291                                 + ((unsigned char)((*inbuf)[1]));
1292                         if (inci == 4) {
1293                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1294                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1295                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1296                         } else {
1297                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1298                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1299                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1300                                         ) {                     /* surrogate */
1301                                         wc =- 0xd800U;
1302                                         if (*inbytesleft < 4) goto INVALID;
1303                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1304                                                 + ((unsigned char)((*inbuf)[3]));
1305                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1306                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1307                                                 goto ILLEGAL;
1308                                         }
1309                                         inci = 4;       /* Change inci here in case skipping illegals. */
1310                                         wc = 0x10000UL + (wc << 10) + wc2;
1311                                 }
1312                         }
1313
1314                         if (px->frombom) {
1315                                 px->frombom = 0;
1316                                 if ((wc == 0xfeffU)
1317                                         || (wc == ((inci == 4)
1318                                                            ? (((wchar_t) 0xfffe0000UL))
1319                                                            : ((wchar_t)(0xfffeUL))))
1320                                         ) {
1321                                         if (wc != 0xfeffU) {
1322                                                 px->fromcodeset ^= 1; /* toggle endianness */
1323                                                 wc = 0xfeffU;
1324                                         }
1325                                         if (!px->frombom) {
1326                                                 goto BOM_SKIP_OUTPUT;
1327                                         }
1328                                         goto GOT_BOM;
1329                                 }
1330                         }
1331
1332                         if (px->fromcodeset != IC_WCHAR_T) {
1333                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1334                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1335 #ifdef KUHN
1336                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1337                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1338 #endif
1339                                         ) {
1340                                         goto ILLEGAL;
1341                                 }
1342                         }
1343                 } else if (px->fromcodeset == IC_UTF_8) {
1344                         const char *p = *inbuf;
1345                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1346                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1347                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1348                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1349                                         if (r == (size_t)(-2)) {
1350                                         INVALID:
1351                                                 __set_errno(EINVAL);
1352                                         } else {
1353                                                 px->fromstate.mask = 0;
1354                                                 inci = 1;
1355                                         ILLEGAL:
1356                                                 if (px->skip_invalid_input) {
1357                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1358                                                         goto BOM_SKIP_OUTPUT;
1359                                                 }
1360                                                 __set_errno(EILSEQ);
1361                                         }
1362                                         return (size_t)(-1);
1363                                 }
1364 #ifdef __UCLIBC_MJN3_ONLY__
1365 #warning optimize this
1366 #endif
1367                                 if (p != NULL) { /* incomplet char case */
1368                                         goto INVALID;
1369                                 }
1370                                 p = *inbuf + 1; /* nul */
1371                         }
1372                         inci = p - *inbuf;
1373                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1374                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1375                                 goto ILLEGAL;
1376                         } else {                        /* some other 8-bit ascii-extension codeset */
1377                                 const codeset_8_bit_t *c8b
1378                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1379                                 wc -= 0x80;
1380                                 wc = __global_locale.tbl8c2wc[
1381                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1382                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1383                                 if (!wc) {
1384                                         goto ILLEGAL;
1385                                 }
1386                         }
1387                 }
1388
1389
1390                 if (px->tobom) {
1391                         inci = 0;
1392                         wc = 0xfeffU;
1393         GOT_BOM:
1394                         px->tobom = 0;
1395                 }
1396
1397                 if (px->tocodeset >= IC_MULTIBYTE) {
1398                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1399                         if (*outbytesleft < inci) goto TOO_BIG;
1400                         if (px->tocodeset != IC_WCHAR_T) {
1401                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1402                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1403 #ifdef KUHN
1404                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1405                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1406 #endif
1407                                         ) {
1408                                 REPLACE_32:
1409                                         wc = 0xfffd;
1410                                         ++nrcount;
1411                                 }
1412                         }
1413                         if (inco == 4) {
1414                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1415                         } else {
1416                                 if (((__uwchar_t)wc ) > 0xffffU) {
1417                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1418                                                 goto REPLACE_32;
1419                                         }
1420                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1421                                         wc2 = 0xdc00U + (wc & 0x3ff);
1422                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1423                                         if (px->tocodeset & 1) {
1424                                                 wc = bswap_16(wc);
1425                                                 wc2 = bswap_16(wc2);
1426                                         }
1427                                         wc += (wc2 << 16);
1428                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1429                         }
1430                         (*outbuf)[0] = (char)((unsigned char)(wc));
1431                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1432                         if (inco == 4) {
1433                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1434                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1435                         }
1436                 } else if (px->tocodeset == IC_UTF_8) {
1437                         const wchar_t *pw = &wc;
1438                         do {
1439                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1440                                 if (r != (size_t)(-1)) {
1441 #ifdef __UCLIBC_MJN3_ONLY__
1442 #warning what happens for a nul?
1443 #endif
1444                                         if (r == 0) {
1445                                                 if (wc != 0) {
1446                                                         goto TOO_BIG;
1447                                                 }
1448                                                 ++r;
1449                                         }
1450                                         break;
1451                                 }
1452                                 wc = 0xfffdU;
1453                                 ++nrcount;
1454                         } while (1);
1455                         inco = r;
1456                 } else if (((__uwchar_t)(wc)) < 0x80) {
1457                 CHAR_GOOD:
1458                                 **outbuf = wc;
1459                 } else {
1460                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1461                                 const codeset_8_bit_t *c8b
1462                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1463                                 __uwchar_t u;
1464                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1465                                 u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1466                                                  + ((wc >> Cwc2c_TT_SHIFT)
1467                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1468                                 wc = __global_locale.tbl8wc2c[Cwc2c_TI_LEN
1469                                                  + (u << Cwc2c_TT_SHIFT)
1470                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1471                                 if (wc) {
1472                                         goto CHAR_GOOD;
1473                                 }
1474                         }
1475                         **outbuf = '?';
1476                         ++nrcount;
1477                 }
1478
1479                 *outbuf += inco;
1480                 *outbytesleft -= inco;
1481         BOM_SKIP_OUTPUT:
1482                 *inbuf += inci;
1483                 *inbytesleft -= inci;
1484         }
1485         return nrcount;
1486 }
1487
1488 #endif
1489 /**********************************************************************/
1490 #ifdef L_iconv_main
1491
1492 #include <stdio.h>
1493 #include <stdlib.h>
1494 #include <string.h>
1495 #include <wchar.h>
1496 #include <iconv.h>
1497 #include <stdarg.h>
1498 #include <libgen.h>
1499
1500 extern const unsigned char codesets[];
1501
1502 #define IBUF BUFSIZ
1503 #define OBUF BUFSIZ
1504
char *progname;         /* set by main() from argv[0]; prefixes diagnostics */
int hide_errors;        /* non-zero: error_msg() exits without printing */

static void error_msg(const char *fmt, ...)
         __attribute__ ((noreturn, format (printf, 1, 2)));

/*
 * Report a fatal condition and terminate the process.
 *
 * fmt and the variadic arguments follow printf() conventions.  Unless
 * hide_errors is set, "<progname>: " followed by the formatted message
 * is written to stderr.  In every case the process then exits with
 * EXIT_FAILURE; this function never returns to its caller.
 */
static void error_msg(const char *fmt, ...)
{
        if (hide_errors) {
                /* Silent mode: same exit status, no text. */
                exit(EXIT_FAILURE);
        }

        {
                va_list ap;

                fprintf(stderr, "%s: ", progname);
                va_start(ap, fmt);
                vfprintf(stderr, fmt, ap);
                va_end(ap);
        }

        exit(EXIT_FAILURE);
}
1524
1525 int main(int argc, char **argv)
1526 {
1527         FILE *ifile;
1528         FILE *ofile = stdout;
1529         const char *p;
1530         const char *s;
1531         static const char opt_chars[] = "tfocsl";
1532                                       /* 012345 */
1533         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1534         iconv_t ic;
1535         char ibuf[IBUF];
1536         char obuf[OBUF];
1537         char *pi;
1538         char *po;
1539         size_t ni, no, r, pos;
1540
1541         hide_errors = 0;
1542
1543         for (s = opt_chars ; *s ; s++) {
1544                 opts[ s - opt_chars ] = NULL;
1545         }
1546
1547         progname = *argv;
1548         while (--argc) {
1549                 p = *++argv;
1550                 if ((*p != '-') || (*++p == 0)) {
1551                         break;
1552                 }
1553                 do {
1554                         if ((s = strchr(opt_chars,*p)) == NULL) {
1555                         USAGE:
1556                                 s = basename(progname);
1557                                 fprintf(stderr,
1558                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1559                                                 "  or\n%s -l\n", s, s);
1560                                 return EXIT_FAILURE;
1561                         }
1562                         if ((s - opt_chars) < 3) {
1563                                 if ((--argc == 0) || opts[s - opt_chars]) {
1564                                         goto USAGE;
1565                                 }
1566                                 opts[s - opt_chars] = *++argv;
1567                         } else {
1568                                 opts[s - opt_chars] = p;
1569                         }
1570                 } while (*++p);
1571         }
1572
1573         if (opts[5]) {                          /* -l */
1574                 fprintf(stderr, "Recognized codesets:\n");
1575                 for (s = codesets ; *s ; s += *s) {
1576                         fprintf(stderr,"  %s\n", s+2);
1577                 }
1578                 s = CODESET_LIST;
1579                 do {
1580                         fprintf(stderr,"  %s\n", CODESET_LIST+ (unsigned char)(*s));
1581                 } while (*++s);
1582
1583                 return EXIT_SUCCESS;
1584         }
1585
1586         if (opts[4]) {
1587                 hide_errors = 1;
1588         }
1589
1590         if (!opts[0] || !opts[1]) {
1591                 goto USAGE;
1592         }
1593         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1594                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1595         }
1596         if (opts[3]) {                          /* -c */
1597                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1598         }
1599
1600         if ((s = opts[2]) != NULL) {
1601                 if (!(ofile = fopen(s, "w"))) {
1602                         error_msg( "couldn't open %s for writing\n", s);
1603                 }
1604         }
1605
1606         pos = ni = 0;
1607         do {
1608                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1609                         ifile = stdin;          /* we don't check for duplicates */
1610                 } else if (!(ifile = fopen(*argv, "r"))) {
1611                         error_msg( "couldn't open %s for reading\n", *argv);
1612                 }
1613
1614                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1615                         pos += r;
1616                         ni += r;
1617                         no = OBUF;
1618                         pi = ibuf;
1619                         po = obuf;
1620                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1621                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1622                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1623                                 }
1624                         }
1625                         if ((r = OBUF - no) > 0) {
1626                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1627                                         error_msg( "write error\n");
1628                                 }
1629                         }
1630                         if (ni) {                       /* still bytes in buffer! */
1631                                 memmove(ibuf, pi, ni);
1632                         }
1633                 }
1634
1635                 if (ferror(ifile)) {
1636                         error_msg( "read error\n");
1637                 }
1638
1639                 ++argv;
1640
1641                 if (ifile != stdin) {
1642                         fclose(ifile);
1643                 }
1644
1645         } while (--argc > 0);
1646
1647         iconv_close(ic);
1648
1649         if (ni) {
1650                 error_msg( "incomplete sequence\n");
1651         }
1652
1653         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1654                 ? EXIT_SUCCESS : EXIT_FAILURE;
1655 }
1656
1657 #endif
1658 /**********************************************************************/