libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the standard's requirement that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
  100 #ifdef _LIBC
      /* Everything down to the matching #endif (headers, configuration macros
       * and the hidden helper prototypes) is only seen when _LIBC is defined. */
  101 #include <errno.h>
  102 #include <stddef.h>
  103 #include <limits.h>
  104 #include <stdint.h>
  105 #include <inttypes.h>
  106 #include <stdlib.h>
  107 #include <stdio.h>
  108 #include <assert.h>
  109 #include <locale.h>
  110 #include <wchar.h>
  111 #include <bits/uClibc_uwchar.h>
  112
  113 /**********************************************************************/
  114 #ifdef __UCLIBC_HAS_LOCALE__
  115 #ifdef __UCLIBC_MJN3_ONLY__
  116 #ifdef L_iswspace
  117 /* generates one warning */
  118 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
  119 #endif
  120 #endif /* __UCLIBC_MJN3_ONLY__ */
  121
      /* With locale support, the encoding is read from the current locale. */
  122 #define ENCODING                (__UCLIBC_CURLOCALE->encoding)
  123
      /* Short local aliases for the locale-data table geometry constants used
       * by the 8-bit <-> wide-character table lookups in this file. */
  124 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
  125 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
  126 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
  127 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
  128 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
  129 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
  130
  131 #ifndef __CTYPE_HAS_UTF_8_LOCALES
  132 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
  133 #endif
  134
  135 #else  /* __UCLIBC_HAS_LOCALE__ */
  136
  137 #ifdef __UCLIBC_MJN3_ONLY__
  138 #ifdef L_btowc
  139 /* emit only once */
  140 #warning fix preprocessor logic testing locale settings
  141 #endif
  142 #endif
  143
      /* Without locale support the encoding is fixed at 7-bit.  Having either
       * 8-bit or UTF-8 locale support configured here is a build error, and
       * the two UTF-8 helper objects are disabled by undefining their L_
       * selection macros. */
  144 #define ENCODING (__ctype_encoding_7_bit)
  145 #ifdef __CTYPE_HAS_8_BIT_LOCALES
  146 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
  147 #endif
  148 #ifdef __CTYPE_HAS_UTF_8_LOCALES
  149 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
  150 #endif
  151 #undef L__wchar_utf8sntowcs
  152 #undef L__wchar_wcsntoutf8s
  153
  154 #endif /* __UCLIBC_HAS_LOCALE__ */
  155 /**********************************************************************/
  156
      /* Longest UTF-8 sequence handled: 6 bytes when wchar_t can hold values
       * above 0xffff, otherwise 3 bytes. */
  157 #if WCHAR_MAX > 0xffffUL
  158 #define UTF_8_MAX_LEN 6
  159 #else
  160 #define UTF_8_MAX_LEN 3
  161 #endif
  162
      /* KUHN enables the extra validity checks in the two UTF-8 helpers below
       * (rejection of 0xd800-0xdfff and of 0xfffe/0xffff). */
  163 #define KUHN 1
  164
  165 /* Implementation-specific work functions. */
  166
      /* UTF-8 -> wide-character worker; see its definition for the meaning of
       * the special pwc values and of allow_continuation. */
  167 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  168                                         const char **__restrict src, size_t n,
  169                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
  170
      /* Wide-character -> UTF-8 worker; see its definition for the meaning of
       * the special s values. */
  171 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
  172                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
  173 #endif
 174 /**********************************************************************/
 175 #ifdef L_btowc
 176
 177
 178 wint_t btowc(int c)
 179 {
 180 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 181
 182         wchar_t wc;
 183         unsigned char buf[1];
 184         mbstate_t mbstate;
 185
 186         if (c != EOF) {
 187                 *buf = (unsigned char) c;
 188                 mbstate.__mask = 0;             /* Initialize the mbstate. */
 189                 if (mbrtowc(&wc, (char*) buf, 1, &mbstate) <= 1) {
 190                         return wc;
 191                 }
 192         }
 193         return WEOF;
 194
 195 #else  /* !__CTYPE_HAS_8_BIT_LOCALES */
 196
 197 #ifdef __UCLIBC_HAS_LOCALE__
 198         assert((ENCODING == __ctype_encoding_7_bit)
 199                    || (ENCODING == __ctype_encoding_utf8));
 200 #endif
 201
 202         /* If we don't have 8-bit locale support, then this is trivial since
 203          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 204         return (((unsigned int)c) < 0x80) ? c : WEOF;
 205
 206 #endif /* !__CTYPE_HAS_8_BIT_LOCALES */
 207 }
 208 libc_hidden_def(btowc)
 209
 210 #endif
 211 /**********************************************************************/
 212 #ifdef L_wctob
 213
 214 /* Note: We completely ignore ps in all currently supported conversions. */
 215
 216
 217 int wctob(wint_t c)
 218 {
 219 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 220
 221         unsigned char buf[MB_LEN_MAX];
 222
 223         return (wcrtomb((char*) buf, c, NULL) == 1) ? *buf : EOF;
 224
 225 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 226
 227 #ifdef __UCLIBC_HAS_LOCALE__
 228         assert((ENCODING == __ctype_encoding_7_bit)
 229                    || (ENCODING == __ctype_encoding_utf8));
 230 #endif /* __UCLIBC_HAS_LOCALE__ */
 231
 232         /* If we don't have 8-bit locale support, then this is trivial since
 233          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 234
 235         /* TODO: need unsigned version of wint_t... */
 236 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 237         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 238
 239 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 240 }
 241
 242 #endif
 243 /**********************************************************************/
 244 #ifdef L_mbsinit
 245
 246 int mbsinit(const mbstate_t *ps)
 247 {
 248         return !ps || !ps->__mask;
 249 }
 250 libc_hidden_def(mbsinit)
 251
 252 #endif
 253 /**********************************************************************/
 254 #ifdef L_mbrlen
 255
 256
 257 size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 258 {
 259         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 260
 261         return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 262 }
 263 libc_hidden_def(mbrlen)
 264
 265 #endif
 266 /**********************************************************************/
 267 #ifdef L_mbrtowc
 268
 269
 270 size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 271                            size_t n, mbstate_t *__restrict ps)
 272 {
 273         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 274         wchar_t wcbuf[1];
 275         const char *p;
 276         size_t r;
 277         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 278
 279         if (!ps) {
 280                 ps = &mbstate;
 281         }
 282
 283         if (!s) {
 284                 pwc = (wchar_t *) s;    /* NULL */
 285                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 286                 s = empty_string;
 287                 n = 1;
 288         } else if (*s == '\0') {
 289                 if (pwc)
 290                         *pwc = '\0';
 291         /* According to the ISO C 89 standard this is the expected behaviour.  */
 292                 return 0;
 293         } else if (!n) {
 294                 /* TODO: change error code? */
 295 #if 0
 296                 return (ps->__mask && (ps->__wc == 0xffffU))
 297                         ? ((size_t) -1) : ((size_t) -2);
 298 #else
 299                 return 0;
 300 #endif
 301         }
 302
 303         p = s;
 304
 305 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 306         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 307         if (ENCODING == __ctype_encoding_utf8) {
 308                 if (!pwc) {
 309                         pwc = wcbuf;
 310                 }
 311                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 312                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 313         }
 314 #endif
 315
 316 #ifdef __UCLIBC_MJN3_ONLY__
 317 #warning TODO: This adds a trailing nul!
 318 #endif /* __UCLIBC_MJN3_ONLY__ */
 319
 320         r = mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 321
 322         if (((ssize_t) r) >= 0) {
 323                 if (pwc) {
 324                         *pwc = *wcbuf;
 325                 }
 326         }
 327         return (size_t) r;
 328 }
 329 libc_hidden_def(mbrtowc)
 330
 331 #endif
 332 /**********************************************************************/
 333 #ifdef L_wcrtomb
 334
 335
 336 /* Note: We completely ignore ps in all currently supported conversions. */
 337 /* TODO: Check for valid state anyway? */
 338
 339 size_t wcrtomb(register char *__restrict s, wchar_t wc,
 340                            mbstate_t *__restrict ps)
 341 {
 342 #ifdef __UCLIBC_MJN3_ONLY__
 343 #warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
 344 #endif /* __UCLIBC_MJN3_ONLY__ */
 345         wchar_t wcbuf[1];
 346         const wchar_t *pwc;
 347         size_t r;
 348         char buf[MB_LEN_MAX];
 349
 350         if (!s) {
 351                 s = buf;
 352                 wc = 0;
 353         }
 354
 355         pwc = wcbuf;
 356         wcbuf[0] = wc;
 357
 358         r = wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
 359         return (r != 0) ? r : 1;
 360 }
 361 libc_hidden_def(wcrtomb)
 362
 363 #endif
 364 /**********************************************************************/
 365 #ifdef L_mbsrtowcs
 366
 367
 368 size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 369                                  size_t len, mbstate_t *__restrict ps)
 370 {
 371         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 372
 373         return mbsnrtowcs(dst, src, SIZE_MAX, len,
 374                                                 ((ps != NULL) ? ps : &mbstate));
 375 }
 376 libc_hidden_def(mbsrtowcs)
 377
 378 #endif
 379 /**********************************************************************/
 380 #ifdef L_wcsrtombs
 381
 382 /* Note: We completely ignore ps in all currently supported conversions.
 383
 384  * TODO: Check for valid state anyway? */
 385
 386
 387 size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 388                                  size_t len, mbstate_t *__restrict ps)
 389 {
 390         return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 391 }
 392 libc_hidden_def(wcsrtombs)
 393
 394 #endif
 395 /**********************************************************************/
 396 #ifdef L__wchar_utf8sntowcs
 397
  398 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
  399  * UTF-8-test.txt stress test.
  400  */
  401 /*  #define DECODER */
  402
  403 #ifdef DECODER
  404 #ifndef KUHN
  405 #define KUHN
  406 #endif
  407 #endif
  408
      /* Convert UTF-8 bytes from *src into wide characters.
       *
       * pwc   Destination buffer.  NULL means "count only, no limit" (wn is
       *       forced to SIZE_MAX).  Passing (wchar_t *) ps means "count only,
       *       but honor wn".  In both cases results go to a scratch slot.
       * wn    Maximum number of wide characters to produce; 0 returns 0 at once.
       * src   In/out input position.  Written back at DONE only when storing
       *       into a caller buffer (NULL if the terminating nul was converted
       *       in a non-DECODER build), and always before returning (size_t) -2.
       * n     Maximum number of input bytes to consume.
       * ps    Conversion state, must not be NULL (asserted).  A nonzero __mask
       *       marks a pending partial sequence whose accumulated value is in
       *       __wc; in non-DECODER builds __wc == 0xffffU marks an error state.
       * allow_continuation
       *       Nonzero: an incomplete trailing sequence is saved in ps and
       *       (size_t) -2 is returned.  Zero: see the handling below.
       *
       * Returns the number of wide characters produced (a terminating nul is
       * not counted), (size_t) -1 with errno set to EILSEQ on an invalid
       * sequence (non-DECODER builds), or (size_t) -2 as described above.
       */
  409 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
  410                                                   const char **__restrict src, size_t n,
  411                                                   mbstate_t *ps, int allow_continuation)
  412 {
  413         register const char *s;
  414         __uwchar_t mask;
  415         __uwchar_t wc;
  416         wchar_t wcbuf[1];
  417         size_t count;
  418         int incr;
  419
  420         s = *src;
  421
  422         assert(s != NULL);
  423         assert(ps != NULL);
  424
  425         incr = 1;
  426         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
  427          * wprintf, we need to be able to compute the number of wchars needed
  428          * for the mbs conversion, not to exceed the precision specified.
  429          * But if dst is NULL, the return value is the length assuming a
  430          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
  431          * as pwc in order to flag that we really want the length, subject
  432          * to the restricted buffer size and no partial conversions.
  433          * See mbsnrtowcs() as well. */
  434         if (!pwc || (pwc == ((wchar_t *)ps))) {
  435                 if (!pwc) {
  436                         wn = SIZE_MAX;
  437                 }
                      /* Count-only mode: every result overwrites the same scratch
                       * slot because incr is 0. */
  438                 pwc = wcbuf;
  439                 incr = 0;
  440         }
  441
  442         /* This is really here only to support the glibc extension function
  443          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
  444          * check on the validity of the mbstate. */
  445         if (!(count = wn)) {
  446                 return 0;
  447         }
  448
  449         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
  450 #ifdef DECODER
  451                 wc = (__uwchar_t) ps->__wc;
  452                 if (n) {
  453                         goto CONTINUE;
  454                 }
  455                 goto DONE;
  456 #else
                      /* 0xffffU in __wc is the marker stored at BAD below. */
  457                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
  458                         /* TODO: change error code here and below? */
  459                         if (n) {
  460                                 goto CONTINUE;
  461                         }
  462                         goto DONE;
  463                 }
  464                 __set_errno(EILSEQ);
  465                 return (size_t) -1;             /* We're in an error state. */
  466 #endif
  467         }
  468
  469         do {
  470                 if (!n) {
  471                         goto DONE;
  472                 }
  473                 --n;
  474                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
  475                         mask = 0x40;
  476 #ifdef __UCLIBC_MJN3_ONLY__
  477 #warning TODO: Fix range for 16 bit wchar_t case.
  478 #endif
                              /* Accept lead bytes 0xc2..0xfd only: the first test
                               * admits 0xc0..0xfd, the second excludes 0xc0 and
                               * 0xc1.  Everything else falls through to BAD. */
  479                         if (( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) &&
  480                         (((unsigned char)s[-1] != 0xc0 ) && ((unsigned char)s[-1] != 0xc1 ))) {
  481                                 goto START;
  482                         }
  483                 BAD:
  484 #ifdef DECODER
  485                         wc = 0xfffdU;
  486                         goto COMPLETE;
  487 #else
                              /* Record the error state in ps so that a later call
                               * with the same state fails immediately. */
  488                         ps->__mask = mask;
  489                         ps->__wc = 0xffffU;
  490                         __set_errno(EILSEQ);
  491                         return (size_t) -1;     /* Illegal start byte! */
  492 #endif
  493
  494                 CONTINUE:
  495                         while (n) {
  496                                 --n;
                                      /* Every trailing byte must look like 10xxxxxx. */
  497                                 if ((*s & 0xc0) != 0x80) {
  498                                         goto BAD;
  499                                 }
  500                                 mask <<= 5;
  501                                 wc <<= 6;
  502                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
  503                                 ++s;
  504                         START:
  505                                 wc &= ~(mask << 1);
  506
  507                                 if ((wc & mask) == 0) { /* Character completed. */
  508                                         if ((mask >>= 5) == 0x40) {
  509                                                 mask += mask;
  510                                         }
  511                                         /* Check for invalid sequences (longer than necessary)
  512                                          * and invalid chars.  */
  513                                         if ( (wc < mask) /* Sequence not minimal length. */
  514 #ifdef KUHN
  515 #if UTF_8_MAX_LEN == 3
  516 #error broken since mask can overflow!!
  517                                                  /* For plane 0, these are the only defined values.*/
  518                                                  || (wc > 0xfffdU)
  519 #else
  520                                                  /* Note that we don't need to worry about exceeding */
  521                                                  /* 31 bits as that is the most that UTF-8 provides. */
  522                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
  523 #endif
  524                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
  525 #endif /* KUHN */
  526                                                  ) {
  527                                                 goto BAD;
  528                                         }
  529                                         goto COMPLETE;
  530                                 }
  531                         }
  532                         /* Character potentially valid but incomplete. */
  533                         if (!allow_continuation) {
                                      /* NOTE(review): when some characters were already
                                       * converted this returns 0, not wn - count, and
                                       * leaves *src and ps untouched -- confirm that
                                       * this is intended. */
  534                                 if (count != wn) {
  535                                         return 0;
  536                                 }
  537                                 /* NOTE: The following can fail if you allow and then disallow
  538                                  * continuation!!! */
  539 #if UTF_8_MAX_LEN == 3
  540 #error broken since mask can overflow!!
  541 #endif
  542                                 /* Need to back up... */
  543                                 do {
  544                                         --s;
  545                                 } while ((mask >>= 5) >= 0x40);
  546                                 goto DONE;
  547                         }
                              /* Save the partial character so a later call can resume
                               * at CONTINUE, and report the bytes as consumed. */
  548                         ps->__mask = (wchar_t) mask;
  549                         ps->__wc = (wchar_t) wc;
  550                         *src = s;
  551                         return (size_t) -2;
  552                 }
  553         COMPLETE:
  554                 *pwc = wc;
  555                 pwc += incr;
  556         }
  557 #ifdef DECODER
  558         while (--count);
  559 #else
              /* Stop on the nul character (which is stored but not counted, since
               * the short-circuit skips --count) or when wn characters are done. */
  560         while (wc && --count);
  561
  562         if (!wc) {
  563                 s = NULL;
  564         }
  565 #endif
  566
  567  DONE:
  568         /* ps->__wc is irrelevant here. */
  569         ps->__mask = 0;
              /* Only a real destination buffer gets the input position written
               * back; in count-only mode pwc still equals wcbuf. */
  570         if (pwc != wcbuf) {
  571                 *src = s;
  572         }
  573
  574         return wn - count;
  575 }
 576
 577 #endif
 578 /**********************************************************************/
 579 #ifdef L__wchar_wcsntoutf8s
 580
 581 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 582                                                   const wchar_t **__restrict src, size_t wn)
 583 {
 584         register char *p;
 585         size_t len, t;
 586         __uwchar_t wc;
 587         const __uwchar_t *swc;
 588         int store;
 589         char buf[MB_LEN_MAX];
 590         char m;
 591
 592         store = 1;
 593         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 594          * printf, we need to be able to compute the number of bytes needed
 595          * for the mbs conversion, not to exceed the precision specified.
 596          * But if dst is NULL, the return value is the length assuming a
 597          * sufficiently sized buffer.  So, we allow passing of (char *) src
 598          * as dst in order to flag that we really want the length, subject
 599          * to the restricted buffer size and no partial conversions.
 600          * See wcsnrtombs() as well. */
 601         if (!s || (s == ((char *) src))) {
 602                 if (!s) {
 603                         n = SIZE_MAX;
 604                 }
 605                 s = buf;
 606                 store = 0;
 607         }
 608
 609         t = n;
 610         swc = (const __uwchar_t *) *src;
 611
 612         assert(swc != NULL);
 613
 614         while (wn && t) {
 615                 wc = *swc;
 616
 617                 *s = wc;
 618                 len = 1;
 619
 620                 if (wc >= 0x80) {
 621 #ifdef KUHN
 622                         if (
 623 #if UTF_8_MAX_LEN == 3
 624                                 /* For plane 0, these are the only defined values.*/
 625                                 /* Note that we don't need to worry about exceeding */
 626                                 /* 31 bits as that is the most that UTF-8 provides. */
 627                                 (wc > 0xfffdU)
 628 #else
 629                                 /* UTF_8_MAX_LEN == 6 */
 630                                 (wc > 0x7fffffffUL)
 631                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 632 #endif
 633                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 634                                 ) {
 635                                 __set_errno(EILSEQ);
 636                                 return (size_t) -1;
 637                         }
 638 #else  /* KUHN */
 639 #if UTF_8_MAX_LEN != 3
 640                         if (wc > 0x7fffffffUL) { /* Value too large. */
 641                                 __set_errno(EILSEQ);
 642                                 return (size_t) -1;
 643                         }
 644 #endif
 645 #endif /* KUHN */
 646
 647                         wc >>= 1;
 648                         p = s;
 649                         do {
 650                                 ++p;
 651                         } while (wc >>= 5);
 652                         wc = *swc;
 653                         if ((len = p - s) > t) { /* Not enough space. */
 654                                 break;
 655                         }
 656
 657                         m = 0x80;
 658                         while( p>s ) {
 659                                 m = (m >> 1) | 0x80;
 660                                 *--p = (wc & 0x3f) | 0x80;
 661                                 wc >>= 6;
 662                         }
 663                         *s |= (m << 1);
 664                 } else if (wc == 0) {   /* End of string. */
 665                         swc = NULL;
 666                         break;
 667                 }
 668
 669                 ++swc;
 670                 --wn;
 671                 t -= len;
 672                 if (store) {
 673                         s += len;
 674                 }
 675         }
 676
 677         if (store) {
 678                 *src = (const wchar_t *) swc;
 679         }
 680
 681         return n - t;
 682 }
 683
 684
 685 #endif
 686 /**********************************************************************/
 687 #ifdef L_mbsnrtowcs
 688
 689 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 690
 691 size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 692                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
 693 {
 694         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 695         wchar_t wcbuf[1];
 696         const char *s;
 697         size_t count;
 698         int incr;
 699
 700         if (!ps) {
 701                 ps = &mbstate;
 702         }
 703
 704 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 705         if (ENCODING == __ctype_encoding_utf8) {
 706                 size_t r;
 707                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
 708                                 != (size_t) -2) ? r : 0;
 709         }
 710 #endif
 711         incr = 1;
 712         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 713          * wprintf, we need to be able to compute the number of wchars needed
 714          * for the mbs conversion, not to exceed the precision specified.
 715          * But if dst is NULL, the return value is the length assuming a
 716          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
 717          * as dst in order to flag that we really want the length, subject
 718          * to the restricted buffer size and no partial conversions.
 719          * See _wchar_utf8sntowcs() as well. */
 720         if (!dst || (dst == ((wchar_t *)ps))) {
 721                 if (!dst) {
 722                         len = SIZE_MAX;
 723                 }
 724                 dst = wcbuf;
 725                 incr = 0;
 726         }
 727
 728         /* Since all the following encodings are single-byte encodings... */
 729         if (len > NMC) {
 730                 len = NMC;
 731         }
 732
 733         count = len;
 734         s = *src;
 735
 736 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 737         if (ENCODING == __ctype_encoding_8_bit) {
 738                 wchar_t wc;
 739                 while (count) {
 740                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
 741                                 wc -= 0x80;
 742                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
 743                                                   (__UCLIBC_CURLOCALE->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
 744                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
 745                                 if (!wc) {
 746                                         goto BAD;
 747                                 }
 748                         }
 749                         if (!(*dst = wc)) {
 750                                 s = NULL;
 751                                 break;
 752                         }
 753                         dst += incr;
 754                         ++s;
 755                         --count;
 756                 }
 757                 if (dst != wcbuf) {
 758                         *src = s;
 759                 }
 760                 return len - count;
 761         }
 762 #endif
 763
 764 #ifdef __UCLIBC_HAS_LOCALE__
 765         assert(ENCODING == __ctype_encoding_7_bit);
 766 #endif
 767
 768         while (count) {
 769                 if ((*dst = (unsigned char) *s) == 0) {
 770                         s = NULL;
 771                         break;
 772                 }
 773                 if (*dst >= 0x80) {
 774 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 775                 BAD:
 776 #endif
 777                         __set_errno(EILSEQ);
 778                         return (size_t) -1;
 779                 }
 780                 ++s;
 781                 dst += incr;
 782                 --count;
 783         }
 784         if (dst != wcbuf) {
 785                 *src = s;
 786         }
 787         return len - count;
 788 }
 789 libc_hidden_def(mbsnrtowcs)
 790
 791 #endif
 792 /**********************************************************************/
 793 #ifdef L_wcsnrtombs
 794
 795 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 796
 797 /* Note: We completely ignore ps in all currently supported conversions.
 798  * TODO: Check for valid state anyway? */
 799
     /* Convert up to NWC wide characters from *src into the multibyte
      * encoding of the current locale, producing at most len bytes.
      *
      * UTF-8 locales: everything is delegated to _wchar_wcsntoutf8s().
      * 8-bit and 7-bit locales (handled here): each wide character becomes
      * exactly one byte, so at most min(len, NWC) characters are converted.
      *
      * Counting modes in the single-byte paths (nothing is stored for the
      * caller and *src is left unchanged):
      *   dst == NULL          - len is replaced by SIZE_MAX.
      *   dst == (char *) src  - len is honoured (printf %ls support, see the
      *                          comment in the body).
      *
      * Return value: number of bytes produced, not counting a converted nul.
      * With a real dst, *src is set to NULL if a wide nul was converted,
      * otherwise it is advanced past the converted characters.  If a
      * character has no single-byte representation (and, in the 8-bit path,
      * no __WCHAR_REPLACEMENT_CHAR is configured), errno is set to EILSEQ and
      * (size_t) -1 is returned without touching *src. */
 800 size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 801                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 802 {
 803         const __uwchar_t *s;
 804         size_t count;
 805         int incr;
 806         char buf[MB_LEN_MAX];
 807
 808 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 809         if (ENCODING == __ctype_encoding_utf8) {
 810                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 811         }
 812 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 813
             /* incr is the per-character step applied to dst: 1 when storing
              * into a caller buffer, 0 when only counting. */
 814         incr = 1;
 815         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 816          * printf, we need to be able to compute the number of bytes needed
 817          * for the mbs conversion, not to exceed the precision specified.
 818          * But if dst is NULL, the return value is the length assuming a
 819          * sufficiently sized buffer.  So, we allow passing of (char *) src
 820          * as dst in order to flag that we really want the length, subject
 821          * to the restricted buffer size and no partial conversions.
 822          * See _wchar_wcsntoutf8s() as well. */
 823         if (!dst || (dst == ((char *) src))) {
 824                 if (!dst) {
 825                         len = SIZE_MAX;
 826                 }
                     /* Counting mode: every byte is written to buf[0] and dst
                      * never moves, so the later "dst != buf" tests are false. */
 827                 dst = buf;
 828                 incr = 0;
 829         }
 830
 831         /* Since all the following encodings are single-byte encodings... */
 832         if (len > NWC) {
 833                 len = NWC;
 834         }
 835
             /* count = characters still allowed; len - count is the result. */
 836         count = len;
 837         s = (const __uwchar_t *) *src;
 838
 839 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 840         if (ENCODING == __ctype_encoding_8_bit) {
 841                 __uwchar_t wc;
 842                 __uwchar_t u;
 843                 while (count) {
 844                         if ((wc = *s) <= 0x7f) {
                                     /* ASCII maps to itself; a nul is stored but not
                                      * counted, and s = NULL reports it via *src. */
 845                                 if (!(*dst = (unsigned char) wc)) {
 846                                         s = NULL;
 847                                         break;
 848                                 }
 849                         } else {
                                     /* u stays 0 ("no mapping") unless the three chained
                                      * lookups through idx8wc2c/tbl8wc2c yield a byte. */
 850                                 u = 0;
 851                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 852                                         u = __UCLIBC_CURLOCALE->idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 853                                                                                                                 + Cwc2c_TT_SHIFT)];
 854                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 855                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 856                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 857                                         u = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
 858                                                                         + (u << Cwc2c_TT_SHIFT)
 859                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 860                                 }
 861
 862 #ifdef __WCHAR_REPLACEMENT_CHAR
 863                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 864 #else  /* __WCHAR_REPLACEMENT_CHAR */
 865                                 if (!u) {
                                             /* Jumps into the 7-bit loop below to share its
                                              * EILSEQ error return. */
 866                                         goto BAD;
 867                                 }
 868                                 *dst = (unsigned char) u;
 869 #endif /* __WCHAR_REPLACEMENT_CHAR */
 870                         }
 871                         ++s;
 872                         dst += incr;
 873                         --count;
 874                 }
                     /* Only a real caller buffer gets *src updated. */
 875                 if (dst != buf) {
 876                         *src = (const wchar_t *) s;
 877                 }
 878                 return len - count;
 879         }
 880 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 881
 882 #ifdef __UCLIBC_HAS_LOCALE__
 883         assert(ENCODING == __ctype_encoding_7_bit);
 884 #endif
 885
             /* 7-bit path: only values below 0x80 are convertible. */
 886         while (count) {
 887                 if (*s >= 0x80) {
 888 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
                     /* Also the target of "goto BAD" in the 8-bit loop above. */
 889                 BAD:
 890 #endif
 891                         __set_errno(EILSEQ);
 892                         return (size_t) -1;
 893                 }
 894                 if ((*dst = (unsigned char) *s) == 0) {
 895                         s = NULL;
 896                         break;
 897                 }
 898                 ++s;
 899                 dst += incr;
 900                 --count;
 901         }
 902         if (dst != buf) {
 903                 *src = (const wchar_t *) s;
 904         }
 905         return len - count;
 906 }
 907 libc_hidden_def(wcsnrtombs)
 908
 909 #endif
 910 /**********************************************************************/
 911 #ifdef L_wcswidth
 912
 913
 914 #ifdef __UCLIBC_MJN3_ONLY__
 915 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
 916 #warning TODO: Update wcwidth to match latest by Kuhn.
 917 #endif
 918
 919 #if defined(__UCLIBC_HAS_LOCALE__) && \
 920 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 921
 922 static const unsigned char new_idx[] = {
             /* Page index used by wcswidth() for 0x100 <= wc <= 0xffff.
              * For h = wc >> 8, new_idx[h] and new_idx[h+1] are the lower and
              * upper bounds handed to the binary search over new_tbl[]; the
              * slot found there is then used to index new_wtbl[].  The extra
              * final element exists so that new_idx[h+1] is valid for h == 0xff. */
 923         0,    5,    5,    6,   10,   15,   28,   39,
 924         48,   48,   71,   94,  113,  128,  139,  154,
 925         175,  186,  188,  188,  188,  188,  188,  188,
 926         203,  208,  208,  208,  208,  208,  208,  208,
 927         208,  219,  219,  219,  222,  222,  222,  222,
 928         222,  222,  222,  222,  222,  222,  222,  224,
 929         224,  231,  231,  231,  231,  231,  231,  231,
 930         231,  231,  231,  231,  231,  231,  231,  231,
 931         231,  231,  231,  231,  231,  231,  231,  231,
 932         231,  231,  231,  231,  231,  231,  231,  231,
 933         231,  231,  231,  231,  231,  231,  231,  231,
 934         231,  231,  231,  231,  231,  231,  231,  231,
 935         231,  231,  231,  231,  231,  231,  231,  231,
 936         231,  231,  231,  231,  231,  231,  231,  231,
 937         231,  231,  231,  231,  231,  231,  231,  231,
 938         231,  231,  231,  231,  231,  231,  231,  231,
 939         231,  231,  231,  231,  231,  231,  231,  231,
 940         231,  231,  231,  231,  231,  231,  231,  231,
 941         231,  231,  231,  231,  231,  231,  231,  231,
 942         231,  231,  231,  231,  231,  231,  231,  231,
 943         231,  231,  231,  231,  231,  233,  233,  233,
 944         233,  233,  233,  233,  234,  234,  234,  234,
 945         234,  234,  234,  234,  234,  234,  234,  234,
 946         234,  234,  234,  234,  234,  234,  234,  234,
 947         234,  234,  234,  234,  234,  234,  234,  234,
 948         234,  234,  234,  234,  234,  234,  234,  234,
 949         234,  234,  234,  234,  234,  234,  234,  234,
 950         236,  236,  236,  236,  236,  236,  236,  236,
 951         236,  236,  236,  236,  236,  236,  236,  236,
 952         236,  236,  236,  236,  236,  236,  236,  236,
 953         236,  236,  236,  236,  236,  236,  236,  236,
 954         236,  237,  237,  238,  241,  241,  242,  249,
 955         255,
 956 };
 957
 958 static const unsigned char new_tbl[] = {
             /* Low-byte boundary values searched by wcswidth().  Within the
              * slice selected through new_idx[], wcswidth() binary-searches for
              * the last entry that is <= (wc & 0xff); that entry's position is
              * the index used into new_wtbl[].
              * NOTE(review): the search presumes each slice is in ascending
              * order - confirm if the table is ever regenerated. */
 959         0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
 960         0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
 961         0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
 962         0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
 963         0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
 964         0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
 965         0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
 966         0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
 967         0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
 968         0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
 969         0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
 970         0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
 971         0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
 972         0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
 973         0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
 974         0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
 975         0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
 976         0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
 977         0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
 978         0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
 979         0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
 980         0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
 981         0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
 982         0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
 983         0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
 984         0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
 985         0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
 986         0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
 987         0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
 988         0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
 989         0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
 990         0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
 991 };
 992
 993 static const signed char new_wtbl[] = {
             /* Column widths parallel to new_tbl[]: wcswidth() adds
              * new_wtbl[i] to its running total, where i is the slot its
              * binary search selected in new_tbl[].
              * The -1 values in the first row belong to the h == 0 slice,
              * which wcswidth() does not consult because it handles
              * wc <= 0xff before doing any table lookup. */
 994         0,   -1,    1,   -1,    1,    1,    0,    1,
 995         0,    1,    1,    0,    1,    0,    1,    1,
 996         0,    1,    0,    1,    0,    1,    0,    1,
 997         0,    1,    0,    1,    1,    0,    1,    0,
 998         1,    0,    1,    0,    1,    0,    1,    1,
 999         0,    1,    0,    1,    0,    1,    0,    1,
1000         1,    0,    1,    0,    1,    0,    1,    0,
1001         1,    0,    1,    0,    1,    0,    1,    0,
1002         1,    0,    1,    0,    1,    0,    1,    1,
1003         0,    1,    0,    1,    0,    1,    0,    1,
1004         0,    1,    0,    1,    0,    1,    0,    1,
1005         0,    1,    0,    1,    0,    1,    1,    0,
1006         1,    0,    1,    0,    1,    0,    1,    0,
1007         1,    0,    1,    0,    1,    0,    1,    0,
1008         1,    1,    0,    1,    0,    1,    0,    1,
1009         0,    1,    0,    1,    0,    1,    0,    1,
1010         1,    0,    1,    0,    1,    0,    1,    0,
1011         1,    0,    1,    1,    0,    1,    0,    1,
1012         0,    1,    0,    1,    0,    1,    0,    1,
1013         0,    1,    1,    0,    1,    0,    1,    0,
1014         1,    0,    1,    0,    1,    0,    1,    0,
1015         1,    0,    1,    0,    1,    0,    1,    1,
1016         0,    1,    0,    1,    0,    1,    0,    1,
1017         0,    1,    2,    0,    1,    0,    1,    0,
1018         1,    0,    1,    0,    1,    0,    1,    0,
1019         1,    0,    1,    1,    0,    1,    0,    1,
1020         1,    0,    1,    0,    1,    0,    1,    0,
1021         1,    0,    1,    1,    2,    1,    1,    2,
1022         2,    0,    2,    1,    2,    0,    2,    2,
1023         1,    1,    2,    1,    1,    2,    1,    0,
1024         1,    1,    0,    1,    0,    1,    2,    1,
1025         0,    2,    1,    2,    1,    0,    1,
1026 };
1027
1028
1029 int wcswidth(const wchar_t *pwcs, size_t n)
1030 {
1031         int h, l, m, count;
1032         wchar_t wc;
1033         unsigned char b;
1034
1035         if (ENCODING == __ctype_encoding_7_bit) {
1036                 size_t i;
1037
1038                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1039                         if (pwcs[i] != (pwcs[i] & 0x7f)) {
1040                                 return -1;
1041                         }
1042                 }
1043         }
1044 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1045         else if (ENCODING == __ctype_encoding_8_bit) {
1046                 mbstate_t mbstate;
1047
1048                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1049                 if (wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1050                         return -1;
1051                 }
1052         }
1053 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1054 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1055         /* For stricter handling of allowed unicode values... see comments above. */
1056         else if (ENCODING == __ctype_encoding_utf8) {
1057                 size_t i;
1058
1059                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1060                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1061                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1062                                 ) {
1063                                 return -1;
1064                         }
1065                 }
1066         }
1067 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1068
1069         for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1070                 if (wc <= 0xff) {
1071                         /* If we're here, wc != 0. */
1072                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1073                                 return -1;
1074                         }
1075                         ++count;
1076                         continue;
1077                 }
1078                 if (((unsigned int) wc) <= 0xffff) {
1079                         b = wc & 0xff;
1080                         h = (wc >> 8);
1081                         l = new_idx[h];
1082                         h = new_idx[h+1];
1083                         while ((m = (l+h) >> 1) != l) {
1084                                 if (b >= new_tbl[m]) {
1085                                         l = m;
1086                                 } else {                /* wc < tbl[m] */
1087                                         h = m;
1088                                 }
1089                         }
1090                         count += new_wtbl[l]; /* none should be -1. */
1091                         continue;
1092                 }
1093
1094                 /* Redo this to minimize average number of compares?*/
1095                 if (wc >= 0x1d167) {
1096                         if (wc <= 0x1d1ad) {
1097                                 if ((wc <= 0x1d169
1098                                          || (wc >= 0x1d173
1099                                                  && (wc <= 0x1d182
1100                                                          || (wc >= 0x1d185
1101                                                                  && (wc <= 0x1d18b
1102                                                                          || (wc >= 0x1d1aa))))))
1103                                         ) {
1104                                         continue;
1105                                 }
1106                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1107                                 continue;
1108                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1109                                 ++count;                /* need 2.. add one here */
1110                         }
1111 #if (WCHAR_MAX > 0x7fffffffL)
1112                         else if (wc > 0x7fffffffL) {
1113                                 return -1;
1114                         }
1115 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1116                 }
1117
1118                 ++count;
1119         }
1120
1121         return count;
1122 }
1123
1124 #else  /*  __UCLIBC_HAS_LOCALE__ */
1125
/* wcswidth() for builds without locale support.
 *
 * Examines at most n wide characters of pwcs, stopping early at a nul.
 * Every printable 7-bit character (0x20..0x7e) occupies one column.
 * Returns the number of columns, or -1 if any examined character is a
 * control character (below 0x20, or 0x7f) or lies outside the 7-bit
 * range (including negative values).
 *
 * This used to be two passes: a 7-bit check followed by a width loop.
 * Once the first pass succeeded, the second pass's "wc > 0xff" branch
 * and its 0x80..0x9f test could never trigger, so both checks are now
 * done in a single pass with the same results. */
int wcswidth(const wchar_t *pwcs, size_t n)
{
        int count;
        wchar_t wc;
        size_t i;

        count = 0;
        for (i = 0 ; (i < n) && pwcs[i] ; i++) {
                wc = pwcs[i];
                if (wc != (wc & 0x7f)) {        /* not 7-bit */
                        return -1;
                }
                if ((wc < 32) || (wc == 0x7f)) {        /* control character */
                        return -1;
                }
                ++count;
        }

        return count;
}
1153
1154 #endif /*  __UCLIBC_HAS_LOCALE__ */
1155
1156 libc_hidden_def(wcswidth)
1157
1158 #endif
1159 /**********************************************************************/
1160 #ifdef L_wcwidth
1161
1162
/* Column width of a single wide character: wcswidth() applied to a
 * one-character sequence, so the result (including 0 for a nul and -1
 * for a rejected character) is whatever wcswidth() reports. */
int wcwidth(wchar_t wc)
{
        const wchar_t *one = &wc;

        return wcswidth(one, 1);
}
1167
1168 #endif
1169 /**********************************************************************/
1170
1171
1172 typedef struct {
             /* Conversion descriptor: allocated by iconv_open() and returned
              * to the caller cast to iconv_t.  The "...0" members keep the
              * values chosen at open time so that iconv() can restore them
              * when asked to reset the conversion state. */
1173         mbstate_t tostate;              /* output-side state; __mask cleared on open/reset */
1174         mbstate_t fromstate;            /* input-side state; passed to _wchar_utf8sntowcs() */
1175         int tocodeset;                  /* target codeset id from find_codeset() */
1176         int fromcodeset;                /* current source codeset id; bit 0 is toggled
                                              * when a byte-swapped BOM is seen */
1177         int frombom;                    /* nonzero: still check the input for a leading BOM */
1178         int tobom;                      /* nonzero: a BOM still has to be written first */
1179         int fromcodeset0;               /* fromcodeset as set by iconv_open() */
1180         int frombom0;                   /* frombom as set by iconv_open() */
1181         int tobom0;                     /* tobom as set by iconv_open() */
1182         int skip_invalid_input;         /* To support iconv -c option. */
1183 } _UC_iconv_t;
1184
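1185 /* For the multibyte codeset ids (values >= 0xe0 in __iconv_codesets):
1186  * bit 0 means swap endian
1187  * bit 1 means 2 byte
1188  * bit 2 means 4 byte
1189  * bit 4 (0x10) means a BOM is used; iconv_open() copies it to tobom/frombom
1190  */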
1191
1192 #if defined L_iconv && defined _LIBC
1193 /* Used externally only by iconv utility */
1194 extern const unsigned char __iconv_codesets[];
1195 libc_hidden_proto(__iconv_codesets)
1196 #endif
1197
1198 #if defined L_iconv || defined L_iconv_main
1199 const unsigned char __iconv_codesets[] =
             /* Table of built-in codeset names, scanned by find_codeset().
              *
              * Each record is laid out as
              *     <record length> <codeset id> <name> <nul>
              * where the length byte covers the whole record and is what the
              * scanner adds to reach the next one.  A zero length byte ends
              * the table.  The last record ("ASCII") has length 7 and no
              * explicit nul: the string literal's own terminator closes the
              * name and doubles as the end-of-table marker.
              *
              * Ids >= 0xe0 are the multi-byte (UCS/UTF-16/UTF-32/WCHAR_T)
              * encodings; ids 1 and 2 are US-ASCII and UTF-8.
              *
              * NOTE(review): in the big-endian table "UTF-32" (0xe4) and
              * "UTF-16" (0xea) do not have the 0x10 bit that iconv_open()
              * reads as the BOM flag, while the little-endian table uses
              * 0xf4/0xfa for the same names.  The "with BOM" comments suggest
              * the big-endian ids may be missing that bit - confirm before
              * relying on BOM handling on big-endian targets. */
1200         "\x0a\xe0""WCHAR_T\x00"         /* superset of UCS-4 but platform-endian */
1201 #if __BYTE_ORDER == __BIG_ENDIAN
1202         "\x08\xec""UCS-4\x00"           /* always BE */
1203         "\x0a\xec""UCS-4BE\x00"
1204         "\x0a\xed""UCS-4LE\x00"
1205         "\x09\xe4""UTF-32\x00"          /* platform endian with BOM */
1206         "\x0b\xe4""UTF-32BE\x00"
1207         "\x0b\xe5""UTF-32LE\x00"
1208         "\x08\xe2""UCS-2\x00"           /* always BE */
1209         "\x0a\xe2""UCS-2BE\x00"
1210         "\x0a\xe3""UCS-2LE\x00"
1211         "\x09\xea""UTF-16\x00"          /* platform endian with BOM */
1212         "\x0b\xea""UTF-16BE\x00"
1213         "\x0b\xeb""UTF-16LE\x00"
1214 #elif __BYTE_ORDER == __LITTLE_ENDIAN
1215         "\x08\xed""UCS-4\x00"           /* always BE */
1216         "\x0a\xed""UCS-4BE\x00"
1217         "\x0a\xec""UCS-4LE\x00"
1218         "\x09\xf4""UTF-32\x00"          /* platform endian with BOM */
1219         "\x0b\xe5""UTF-32BE\x00"
1220         "\x0b\xe4""UTF-32LE\x00"
1221         "\x08\xe3""UCS-2\x00"           /* always BE */
1222         "\x0a\xe3""UCS-2BE\x00"
1223         "\x0a\xe2""UCS-2LE\x00"
1224         "\x09\xfa""UTF-16\x00"          /* platform endian with BOM */
1225         "\x0b\xeb""UTF-16BE\x00"
1226         "\x0b\xea""UTF-16LE\x00"
1227 #endif
1228         "\x08\x02""UTF-8\x00"
1229         "\x0b\x01""US-ASCII\x00"
1230         "\x07\x01""ASCII";                      /* Must be last! (special case to save a nul) */
1231 #endif
1232 #if defined L_iconv && defined _LIBC
1233 libc_hidden_data_def(__iconv_codesets)
1234 #endif
1235
1236
1237 #ifdef L_iconv
1238
1239 #include <iconv.h>
1240 #include <string.h>
1241 #include <endian.h>
1242 #include <byteswap.h>
1243
1244 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1245 #error unsupported endianness for iconv
1246 #endif
1247
1248 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1249 #error currently iconv requires 8 bit locales
1250 #endif
1251 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1252 #error currently iconv requires UTF-8 locales
1253 #endif
1254
1255
1256 enum {
             /* Codeset ids used by the iconv code; the values correspond to
              * id bytes in __iconv_codesets.  For each multi-byte family the
              * constant equals the id that table gives to the explicit "BE"
              * name on the build's byte order.  iconv() tests membership with
              * (codeset & IC_x) == IC_x, and treats any id >= IC_MULTIBYTE as
              * a fixed 2- or 4-byte encoding. */
1257         IC_WCHAR_T = 0xe0,              /* id of "WCHAR_T" */
1258         IC_MULTIBYTE = 0xe0,            /* lowest multi-byte id (same value as IC_WCHAR_T) */
1259 #if __BYTE_ORDER == __BIG_ENDIAN
1260         IC_UCS_4 =      0xec,
1261         IC_UTF_32 = 0xe4,
1262         IC_UCS_2 =      0xe2,
1263         IC_UTF_16 = 0xea,
1264 #else
1265         IC_UCS_4 =      0xed,
1266         IC_UTF_32 = 0xe5,
1267         IC_UCS_2 =      0xe3,
1268         IC_UTF_16 = 0xeb,
1269 #endif
1270         IC_UTF_8 = 2,                   /* id of "UTF-8" */
1271         IC_ASCII = 1                    /* id of "US-ASCII" / "ASCII" */
1272 };
1273
1274
1275 static int find_codeset(const char *name)
1276 {
1277         const unsigned char *s;
1278         int codeset;
1279
1280         for (s = __iconv_codesets; *s; s += *s) {
1281                 if (!strcasecmp((char*) (s + 2), name)) {
1282                         return s[1];
1283                 }
1284         }
1285
1286         /* The following is ripped from find_locale in locale.c. */
1287
1288         /* TODO: maybe CODESET_LIST + *s ??? */
1289         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1290         codeset = 2;
1291         s = (const unsigned char *) __LOCALE_DATA_CODESET_LIST;
1292         do {
1293                 ++codeset;              /* Increment codeset first. */
1294                 if (!strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1295                         return codeset;
1296                 }
1297         } while (*++s);
1298
1299         return 0;                       /* No matching codeset! */
1300 }
1301
1302 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1303 {
1304         register _UC_iconv_t *px;
1305         int tocodeset, fromcodeset;
1306
1307         if (((tocodeset = find_codeset(tocode)) != 0)
1308                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1309                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1310                         px->tocodeset = tocodeset;
1311                         px->tobom0 = px->tobom = (tocodeset >= 0xe0) ? (tocodeset & 0x10) >> 4 : 0;
1312                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1313                         px->frombom0 = px->frombom = (fromcodeset >= 0xe0) ? (fromcodeset & 0x10) >> 4 : 0;
1314                         px->skip_invalid_input = px->tostate.__mask
1315                                 = px->fromstate.__mask = 0;
1316                         return (iconv_t) px;
1317                 }
1318         } else {
1319                 __set_errno(EINVAL);
1320         }
1321         return (iconv_t)(-1);
1322 }
1323
1324 int weak_function iconv_close(iconv_t cd)
1325 {
1326         free(cd);
1327
1328         return 0;
1329 }
1330
1331 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1332                                                    size_t *__restrict inbytesleft,
1333                                                    char **__restrict outbuf,
1334                                                    size_t *__restrict outbytesleft)
1335 {
1336         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1337         size_t nrcount, r;
1338         wchar_t wc, wc2;
1339         int inci, inco;
1340
1341         assert(px != (_UC_iconv_t *)(-1));
1342         assert(sizeof(wchar_t) == 4);
1343
1344         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1345                 /* Note: For shift-state encodings we possibly need to output the
1346                  * shift sequence to return to initial state! */
1347                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1348                 }
1349                 px->tostate.__mask = px->fromstate.__mask = 0;
1350                 px->fromcodeset = px->fromcodeset0;
1351                 px->tobom = px->tobom0;
1352                 px->frombom = px->frombom0;
1353                 return 0;
1354         }
1355
1356         nrcount = 0;
1357         while (*inbytesleft) {
1358                 if (!*outbytesleft) {
1359                 TOO_BIG:
1360                         __set_errno(E2BIG);
1361                         return (size_t) -1;
1362                 }
1363
1364                 inci = inco = 1;
1365                 if (px->fromcodeset >= IC_MULTIBYTE) {
1366                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1367                         if (*inbytesleft < inci) goto INVALID;
1368                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1369                                 + ((unsigned char)((*inbuf)[1]));
1370                         if (inci == 4) {
1371                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1372                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1373                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1374                         } else {
1375                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1376                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1377                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1378                                         ) {                     /* surrogate */
1379                                         wc =- 0xd800U;
1380                                         if (*inbytesleft < 4) goto INVALID;
1381                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1382                                                 + ((unsigned char)((*inbuf)[3]));
1383                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1384                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1385                                                 goto ILLEGAL;
1386                                         }
1387                                         inci = 4;       /* Change inci here in case skipping illegals. */
1388                                         wc = 0x10000UL + (wc << 10) + wc2;
1389                                 }
1390                         }
1391
1392                         if (px->frombom) {
1393                                 px->frombom = 0;
1394                                 if ((wc == 0xfeffU)
1395                                         || (wc == ((inci == 4)
1396                                                            ? (((wchar_t) 0xfffe0000UL))
1397                                                            : ((wchar_t)(0xfffeUL))))
1398                                         ) {
1399                                         if (wc != 0xfeffU) {
1400                                                 px->fromcodeset ^= 1; /* toggle endianness */
1401                                                 wc = 0xfeffU;
1402                                         }
1403                                         if (!px->frombom) {
1404                                                 goto BOM_SKIP_OUTPUT;
1405                                         }
1406                                         goto GOT_BOM;
1407                                 }
1408                         }
1409
1410                         if (px->fromcodeset != IC_WCHAR_T) {
1411                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1412                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1413 #ifdef KUHN
1414                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1415                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1416 #endif
1417                                         ) {
1418                                         goto ILLEGAL;
1419                                 }
1420                         }
1421                 } else if (px->fromcodeset == IC_UTF_8) {
1422                         const char *p = *inbuf;
1423                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1424                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1425                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1426                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1427                                         if (r == (size_t)(-2)) {
1428                                         INVALID:
1429                                                 __set_errno(EINVAL);
1430                                         } else {
1431                                                 px->fromstate.__mask = 0;
1432                                                 inci = 1;
1433                                         ILLEGAL:
1434                                                 if (px->skip_invalid_input) {
1435                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1436                                                         goto BOM_SKIP_OUTPUT;
1437                                                 }
1438                                                 __set_errno(EILSEQ);
1439                                         }
1440                                         return (size_t)(-1);
1441                                 }
1442 #ifdef __UCLIBC_MJN3_ONLY__
1443 #warning TODO: optimize this.
1444 #endif
1445                                 if (p != NULL) { /* incomplete char case */
1446                                         goto INVALID;
1447                                 }
1448                                 p = *inbuf + 1; /* nul */
1449                         }
1450                         inci = p - *inbuf;
1451                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1452                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1453                                 goto ILLEGAL;
1454                         } else {                        /* some other 8-bit ascii-extension codeset */
1455                                 const __codeset_8_bit_t *c8b
1456                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1457                                 wc -= 0x80;
1458                                 wc = __UCLIBC_CURLOCALE->tbl8c2wc[
1459                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1460                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1461                                 if (!wc) {
1462                                         goto ILLEGAL;
1463                                 }
1464                         }
1465                 }
1466
1467
1468                 if (px->tobom) {
1469                         inci = 0;
1470                         wc = 0xfeffU;
1471         GOT_BOM:
1472                         px->tobom = 0;
1473                 }
1474
1475                 if (px->tocodeset >= IC_MULTIBYTE) {
1476                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1477                         if (*outbytesleft < inco) goto TOO_BIG;
1478                         if (px->tocodeset != IC_WCHAR_T) {
1479                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1480                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1481 #ifdef KUHN
1482                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1483                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1484 #endif
1485                                         ) {
1486                                 REPLACE_32:
1487                                         wc = 0xfffd;
1488                                         ++nrcount;
1489                                 }
1490                         }
1491                         if (inco == 4) {
1492                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1493                         } else {
1494                                 if (((__uwchar_t)wc ) > 0xffffU) {
1495                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1496                                                 goto REPLACE_32;
1497                                         }
1498                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1499                                         wc2 = 0xdc00U + (wc & 0x3ff);
1500                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1501                                         if (px->tocodeset & 1) {
1502                                                 wc = bswap_16(wc);
1503                                                 wc2 = bswap_16(wc2);
1504                                         }
1505                                         wc += (wc2 << 16);
1506                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1507                         }
1508                         (*outbuf)[0] = (char)((unsigned char)(wc));
1509                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1510                         if (inco == 4) {
1511                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1512                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1513                         }
1514                 } else if (px->tocodeset == IC_UTF_8) {
1515                         const wchar_t *pw = &wc;
1516                         do {
1517                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1518                                 if (r != (size_t)(-1)) {
1519 #ifdef __UCLIBC_MJN3_ONLY__
1520 #warning TODO: What happens for a nul?
1521 #endif
1522                                         if (r == 0) {
1523                                                 if (wc != 0) {
1524                                                         goto TOO_BIG;
1525                                                 }
1526                                                 ++r;
1527                                         }
1528                                         break;
1529                                 }
1530                                 wc = 0xfffdU;
1531                                 ++nrcount;
1532                         } while (1);
1533                         inco = r;
1534                 } else if (((__uwchar_t)(wc)) < 0x80) {
1535                 CHAR_GOOD:
1536                                 **outbuf = wc;
1537                 } else {
1538                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1539                                 const __codeset_8_bit_t *c8b
1540                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1541                                 __uwchar_t u;
1542                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1543                                 u = __UCLIBC_CURLOCALE->tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1544                                                  + ((wc >> Cwc2c_TT_SHIFT)
1545                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1546                                 wc = __UCLIBC_CURLOCALE->tbl8wc2c[Cwc2c_TI_LEN
1547                                                  + (u << Cwc2c_TT_SHIFT)
1548                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1549                                 if (wc) {
1550                                         goto CHAR_GOOD;
1551                                 }
1552                         }
1553                         **outbuf = '?';
1554                         ++nrcount;
1555                 }
1556
1557                 *outbuf += inco;
1558                 *outbytesleft -= inco;
1559         BOM_SKIP_OUTPUT:
1560                 *inbuf += inci;
1561                 *inbytesleft -= inci;
1562         }
1563         return nrcount;
1564 }
1565 #endif