libc/misc/wchar/wchar.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, write to the Free
  16  *  Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
  52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
  69  *   and consistency with the standard's requirement that a printf format string be
  70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
 100
 101 #define _GNU_SOURCE
 102 #define _ISOC99_SOURCE
 103 #include <errno.h>
 104 #include <stddef.h>
 105 #include <limits.h>
 106 #include <stdint.h>
 107 #include <inttypes.h>
 108 #include <stdlib.h>
 109 #include <stdio.h>
 110 #include <assert.h>
 111 #include <locale.h>
 112 #include <wchar.h>
 113 #include <bits/uClibc_uwchar.h>
 114
 115 /**********************************************************************/
 116 #ifdef __UCLIBC_HAS_LOCALE__
     /* Locale support is enabled: ENCODING reads the encoding field of the
      * current locale data, and the Cc2wc_ and Cwc2c_ names defined below are
      * short aliases for the matching __LOCALE_DATA_ constants used by the
      * 8-bit table lookups later in this file. */
 117 #ifdef __UCLIBC_MJN3_ONLY__
 118 #ifdef L_iswspace
 119 /* generates one warning */
 120 #warning TODO: Fix Cc2wc* and Cwc2c* defines!
 121 #endif
 122 #endif /* __UCLIBC_MJN3_ONLY__ */
 123
 124 #define ENCODING                ((__UCLIBC_CURLOCALE_DATA).encoding)
 125
 126 #define Cc2wc_IDX_SHIFT         __LOCALE_DATA_Cc2wc_IDX_SHIFT
 127 #define Cc2wc_ROW_LEN           __LOCALE_DATA_Cc2wc_ROW_LEN
 128 #define Cwc2c_DOMAIN_MAX        __LOCALE_DATA_Cwc2c_DOMAIN_MAX
 129 #define Cwc2c_TI_SHIFT          __LOCALE_DATA_Cwc2c_TI_SHIFT
 130 #define Cwc2c_TT_SHIFT          __LOCALE_DATA_Cwc2c_TT_SHIFT
 131 #define Cwc2c_TI_LEN            __LOCALE_DATA_Cwc2c_TI_LEN
 132
     /* Building with locale support but without UTF-8 locales only warns. */
 133 #ifndef __CTYPE_HAS_UTF_8_LOCALES
 134 #warning __CTYPE_HAS_UTF_8_LOCALES not set!
 135 #endif
 136
 137 #else  /* __UCLIBC_HAS_LOCALE__ */
     /* No locale support: ENCODING is fixed to __ctype_encoding_7_bit, having
      * either __CTYPE_HAS_8_BIT_LOCALES or __CTYPE_HAS_UTF_8_LOCALES defined is
      * a build error, and the two L__wchar_ guards are undefined so that the
      * UTF-8 work functions guarded by them in this file are not compiled. */
 138
 139 #ifdef __UCLIBC_MJN3_ONLY__
 140 #ifdef L_btowc
 141 /* emit only once */
 142 #warning fix preprocessor logic testing locale settings
 143 #endif
 144 #endif
 145
 146 #define ENCODING (__ctype_encoding_7_bit)
 147 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 148 #error __CTYPE_HAS_8_BIT_LOCALES is defined!
 149 #endif
 150 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 151 #error __CTYPE_HAS_UTF_8_LOCALES is defined!
 152 #endif
 153 #undef L__wchar_utf8sntowcs
 154 #undef L__wchar_wcsntoutf8s
 155
 156 #endif /* __UCLIBC_HAS_LOCALE__ */
 157 /**********************************************************************/
 158
 159 #if WCHAR_MAX > 0xffffUL
     /* UTF_8_MAX_LEN is 6 when WCHAR_MAX exceeds 0xffff and 3 otherwise.  The
      * UTF-8 converters below test it with #if to select their range checks. */
 160 #define UTF_8_MAX_LEN 6
 161 #else
 162 #define UTF_8_MAX_LEN 3
 163 #endif
 164
     /* KUHN turns on the extra validity checks guarded by #ifdef KUHN in the
      * UTF-8 converters (rejecting 0xfffe, 0xffff and 0xd800-0xdfff). */
 165 #define KUHN 1
 166
     /* Declarations, marked attribute_hidden, of the single-character
      * converters defined further down in this file. */
 167 extern size_t __mbrtowc (wchar_t *__restrict __pwc,
 168                        __const char *__restrict __s, size_t __n,
 169                        mbstate_t *__p) attribute_hidden;
 170
 171 extern size_t __wcrtomb (char *__restrict __s, wchar_t __wc,
 172                        mbstate_t *__restrict __ps) attribute_hidden;
 173
 174 /* Implementation-specific work functions. */
 175
     /* UTF-8 bytes -> wide chars: at most wn wide chars from at most n bytes. */
 176 extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
 177                                         const char **__restrict src, size_t n,
 178                                         mbstate_t *ps, int allow_continuation) attribute_hidden;
 179
     /* Wide chars -> UTF-8 bytes: at most n bytes from at most wn wide chars. */
 180 extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 181                                         const wchar_t **__restrict src, size_t wn) attribute_hidden;
 182
 183 /* glibc extensions. */
 184
     /* NMC bounds the number of source bytes, len the number of wide chars. */
 185 extern size_t __mbsnrtowcs(wchar_t *__restrict dst,
 186                                    const char **__restrict src,
 187                                    size_t NMC, size_t len, mbstate_t *__restrict ps) attribute_hidden;
 188
     /* NWC bounds the number of source wide chars, len the number of bytes. */
 189 extern size_t __wcsnrtombs(char *__restrict dst,
 190                                    const wchar_t **__restrict src,
 191                                    size_t NWC, size_t len, mbstate_t *__restrict ps) attribute_hidden;
 192
 193 /**********************************************************************/
 194 #ifdef L_btowc
 195
     /* Convert the single byte c to a wide character.
      *
      * With 8-bit locale support: returns WEOF when c is EOF; otherwise the
      * byte is converted through __mbrtowc() using a freshly initialised
      * conversion state, and the resulting wide char is returned when that
      * call reports 0 or 1, WEOF otherwise.
      *
      * Without 8-bit locale support: returns c when (unsigned int)c < 0x80,
      * WEOF otherwise.
      *
      * The definition is also passed to strong_alias() under the name btowc. */
 196 wint_t attribute_hidden __btowc(int c)
 197 {
 198 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 199
 200         wchar_t wc;
 201         unsigned char buf[1];
 202         mbstate_t mbstate;
 203
 204         if (c != EOF) {
 205                 *buf = (unsigned char) c;
 206                 mbstate.__mask = 0;             /* Initialize the mbstate. */
             /* Error returns are (size_t)-1 / (size_t)-2, which fail the <= 1 test. */
 207                 if (__mbrtowc(&wc, buf, 1, &mbstate) <= 1) {
 208                         return wc;
 209                 }
 210         }
 211         return WEOF;
 212
 213 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 214
 215 #ifdef __UCLIBC_HAS_LOCALE__
 216         assert((ENCODING == __ctype_encoding_7_bit)
 217                    || (ENCODING == __ctype_encoding_utf8));
 218 #endif /* __UCLIBC_HAS_LOCALE__ */
 219
 220         /* If we don't have 8-bit locale support, then this is trivial since
 221          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
             /* The unsigned cast makes negative values (including EOF) fail the test. */
 222         return (((unsigned int)c) < 0x80) ? c : WEOF;
 223
 224 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 225 }
 226 strong_alias(__btowc,btowc)
 227
 228 #endif
 229 /**********************************************************************/
 230 #ifdef L_wctob
 231
 232 /* Note: We completely ignore ps in all currently supported conversions. */
 233
     /* Convert the wide character c to its single-byte representation.
      *
      * With 8-bit locale support: c is converted with __wcrtomb() into a
      * MB_LEN_MAX-sized scratch buffer; the first byte is returned when
      * exactly one byte was produced, EOF otherwise.
      *
      * Without 8-bit locale support: returns c when 0 <= c < 0x80, EOF
      * otherwise. */
 234 int wctob(wint_t c)
 235 {
 236 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 237
 238         unsigned char buf[MB_LEN_MAX];
 239
 240         return (__wcrtomb(buf, c, NULL) == 1) ? *buf : EOF;
 241
 242 #else  /*  __CTYPE_HAS_8_BIT_LOCALES */
 243
 244 #ifdef __UCLIBC_HAS_LOCALE__
 245         assert((ENCODING == __ctype_encoding_7_bit)
 246                    || (ENCODING == __ctype_encoding_utf8));
 247 #endif /* __UCLIBC_HAS_LOCALE__ */
 248
 249         /* If we don't have 8-bit locale support, then this is trivial since
 250          * anything outside of 0-0x7f is illegal in C/POSIX and UTF-8 locales. */
 251
 252         /* TODO: need unsigned version of wint_t... */
 253 /*      return (((unsigned int)c) < 0x80) ? c : WEOF; */
 254         return ((c >= 0) && (c < 0x80)) ? c : EOF;
 255
 256 #endif /*  __CTYPE_HAS_8_BIT_LOCALES */
 257 }
 258
 259 #endif
 260 /**********************************************************************/
 261 #ifdef L_mbsinit
 262
     /* Report whether ps describes an initial conversion state.
      * Returns nonzero when ps is NULL or when ps->__mask is zero, and zero
      * otherwise.  The definition is also passed to strong_alias() under the
      * name mbsinit. */
 263 int attribute_hidden __mbsinit(const mbstate_t *ps)
 264 {
 265         return !ps || !ps->__mask;
 266 }
 267 strong_alias(__mbsinit,mbsinit)
 268
 269 #endif
 270 /**********************************************************************/
 271 #ifdef L_mbrlen
 272
     /* Forward to __mbrtowc() with a NULL destination, so only its return
      * value for the first n bytes of s is reported.  When ps is NULL a state
      * object private to this function is used instead.  The definition is
      * also passed to strong_alias() under the name mbrlen. */
 273 size_t attribute_hidden __mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps)
 274 {
 275         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 276
 277         return __mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate);
 278 }
 279 strong_alias(__mbrlen,mbrlen)
 280
 281 #endif
 282 /**********************************************************************/
 283 #ifdef L_mbrtowc
 284
     /* Convert the next multibyte character in s (at most n bytes) to a wide
      * character.
      *
      *   pwc  where to store the wide char; may be NULL to discard it.
      *   s    source bytes; when NULL, pwc is forced to NULL and a one-byte
      *        empty string is converted instead.
      *   n    maximum number of bytes to examine.
      *   ps   conversion state; when NULL a state private to this function
      *        is used.
      *
      * When s is non-NULL and n is 0, returns (size_t)-1 if the state holds the
      * error marker (__mask nonzero and __wc == 0xffff) and (size_t)-2
      * otherwise.
      *
      * In a UTF-8 locale the work is done by _wchar_utf8sntowcs() with
      * continuation allowed: when it converts one wide char the number of
      * bytes consumed is returned, otherwise its own result is returned
      * unchanged.  In all other encodings one wide char is requested from
      * __mbsnrtowcs() and its result is returned.
      *
      * The definition is also passed to strong_alias() under the name mbrtowc. */
 285 size_t attribute_hidden __mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
 286                            size_t n, mbstate_t *__restrict ps)
 287 {
 288         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 289         wchar_t wcbuf[1];
 290         const char *p;
 291         size_t r;
 292         char empty_string[1];           /* Avoid static to be fPIC friendly. */
 293
 294         if (!ps) {
 295                 ps = &mbstate;
 296         }
 297
 298         if (!s) {
 299                 pwc = (wchar_t *) s;    /* NULL */
 300                 empty_string[0] = 0;    /* Init the empty string when necessary. */
 301                 s = empty_string;
 302                 n = 1;
 303         } else if (!n) {
 304                 /* TODO: change error code? */
 305                 return (ps->__mask && (ps->__wc == 0xffffU))
 306                         ? ((size_t) -1) : ((size_t) -2);
 307         }
 308
             /* p is advanced by the converters; s stays at the start so that
              * p - s is the number of bytes consumed. */
 309         p = s;
 310
 311 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 312         /* Need to do this here since mbsrtowcs doesn't allow incompletes. */
 313         if (ENCODING == __ctype_encoding_utf8) {
                     /* Always hand the worker a real buffer: it gives a NULL
                      * destination a different meaning. */
 314                 if (!pwc) {
 315                         pwc = wcbuf;
 316                 }
 317                 r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1);
 318                 return (r == 1) ? (p-s) : r; /* Need to return 0 if nul char. */
 319         }
 320 #endif
 321
 322 #ifdef __UCLIBC_MJN3_ONLY__
 323 #warning TODO: This adds a trailing nul!
 324 #endif /* __UCLIBC_MJN3_ONLY__ */
 325
 326         r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps);
 327
             /* Copy the result out only on success and only if the caller wants it. */
 328         if (((ssize_t) r) >= 0) {
 329                 if (pwc) {
 330                         *pwc = *wcbuf;
 331                 }
 332         }
 333         return (size_t) r;
 334 }
 335 strong_alias(__mbrtowc,mbrtowc)
 336
 337 #endif
 338 /**********************************************************************/
 339 #ifdef L_wcrtomb
 340
 341 /* Note: We completely ignore ps in all currently supported conversions. */
 342 /* TODO: Check for valid state anyway? */
 343
     /* Convert the wide character wc to its multibyte form in s.
      *
      *   s    destination with room for MB_LEN_MAX bytes; when NULL a local
      *        scratch buffer is used and wc is replaced by 0.
      *   wc   wide character to convert.
      *   ps   conversion state, passed through to __wcsnrtombs().
      *
      * Returns whatever __wcsnrtombs() returns for the one-character input,
      * except that a result of 0 is reported as 1.  The definition is also
      * passed to strong_alias() under the name wcrtomb. */
 344 size_t attribute_hidden __wcrtomb(register char *__restrict s, wchar_t wc,
 345                            mbstate_t *__restrict ps)
 346 {
 347 #ifdef __UCLIBC_MJN3_ONLY__
 348 #warning TODO: Should wcsnrtombs nul-terminate unconditionally?  Check glibc.
 349 #endif /* __UCLIBC_MJN3_ONLY__ */
 350         wchar_t wcbuf[1];
 351         const wchar_t *pwc;
 352         size_t r;
 353         char buf[MB_LEN_MAX];
 354
 355         if (!s) {
 356                 s = buf;
 357                 wc = 0;
 358         }
 359
             /* Present wc as a one-element array so the string converter can be reused. */
 360         pwc = wcbuf;
 361         wcbuf[0] = wc;
 362
 363         r = __wcsnrtombs(s, &pwc, 1, MB_LEN_MAX, ps);
 364         return (r != 0) ? r : 1;
 365 }
 366 strong_alias(__wcrtomb,wcrtomb)
 367
 368 #endif
 369 /**********************************************************************/
 370 #ifdef L_mbsrtowcs
 371
     /* Convert a multibyte string to wide characters by calling __mbsnrtowcs()
      * with the source byte limit set to SIZE_MAX.  When ps is NULL a state
      * object private to this function is used.  The result of __mbsnrtowcs()
      * is returned unchanged.  The definition is also passed to strong_alias()
      * under the name mbsrtowcs. */
 372 size_t attribute_hidden __mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 373                                  size_t len, mbstate_t *__restrict ps)
 374 {
 375         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 376
 377         return __mbsnrtowcs(dst, src, SIZE_MAX, len,
 378                                                 ((ps != NULL) ? ps : &mbstate));
 379 }
 380 strong_alias(__mbsrtowcs,mbsrtowcs)
 381
 382 #endif
 383 /**********************************************************************/
 384 #ifdef L_wcsrtombs
 385
 386 /* Note: We completely ignore ps in all currently supported conversions.
 387
 388  * TODO: Check for valid state anyway? */
 389
     /* Convert a wide-character string to multibyte form by calling
      * __wcsnrtombs() with the source wide-char limit set to SIZE_MAX.  ps is
      * passed through as given (a NULL ps is not replaced here).  The result of
      * __wcsnrtombs() is returned unchanged.  The definition is also passed to
      * strong_alias() under the name wcsrtombs. */
 390 size_t attribute_hidden __wcsrtombs(char *__restrict dst, const wchar_t **__restrict src,
 391                                  size_t len, mbstate_t *__restrict ps)
 392 {
 393         return __wcsnrtombs(dst, src, SIZE_MAX, len, ps);
 394 }
 395 strong_alias(__wcsrtombs,wcsrtombs)
 396
 397 #endif
 398 /**********************************************************************/
 399 #ifdef L__wchar_utf8sntowcs
 400
 401 /* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's
 402  * UTF-8-test.txt stress test.
 403  */
 404 /*  #define DECODER */
 405
 406 #ifdef DECODER
 407 #ifndef KUHN
 408 #define KUHN
 409 #endif
 410 #endif
 411
     /* Convert UTF-8 bytes to wide characters.
      *
      *   pwc   destination.  If NULL, wn is treated as SIZE_MAX and nothing is
      *         kept.  If equal to (wchar_t *) ps, nothing is kept either but
      *         wn is honoured.  In both cases each result is written to a
      *         one-element scratch buffer and *src is not updated on the
      *         normal exit path.
      *   wn    maximum number of wide chars to produce; 0 returns 0 at once.
      *   src   in/out pointer to the source bytes; must not point to NULL.
      *   n     maximum number of source bytes to read.
      *   ps    conversion state; must not be NULL.  A nonzero __mask means a
      *         sequence is in progress, and (outside DECODER builds)
      *         __wc == 0xffff marks an earlier error.
      *   allow_continuation
      *         nonzero: an incomplete trailing sequence is saved in ps, *src
      *         is advanced and (size_t)-2 is returned.  Zero: the function
      *         returns 0 if any wide char was already produced in this call,
      *         otherwise it backs up to the start of the incomplete sequence
      *         and exits normally.
      *
      * Returns the number of wide chars produced (a terminating nul is stored
      * but not counted), or (size_t)-1 with errno set to EILSEQ on an invalid
      * sequence or when entered in the error state.  Outside DECODER builds
      * the source pointer becomes NULL once a nul has been converted. */
 412 size_t attribute_hidden _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn,
 413                                                   const char **__restrict src, size_t n,
 414                                                   mbstate_t *ps, int allow_continuation)
 415 {
 416         register const char *s;
 417         __uwchar_t mask;
 418         __uwchar_t wc;
 419         wchar_t wcbuf[1];
 420         size_t count;
 421         int incr;
 422
 423         s = *src;
 424
 425         assert(s != NULL);
 426         assert(ps != NULL);
 427
 428         incr = 1;
 429         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 430          * wprintf, we need to be able to compute the number of wchars needed
 431          * for the mbs conversion, not to exceed the precision specified.
 432          * But if dst is NULL, the return value is the length assuming a
 433          * sufficiently sized buffer.  So, we allow passing of (wchar_t *) ps
 434          * as pwc in order to flag that we really want the length, subject
 435          * to the restricted buffer size and no partial conversions.
 436          * See mbsnrtowcs() as well. */
 437         if (!pwc || (pwc == ((wchar_t *)ps))) {
 438                 if (!pwc) {
 439                         wn = SIZE_MAX;
 440                 }
 441                 pwc = wcbuf;
 442                 incr = 0;
 443         }
 444
 445         /* This is really here only to support the glibc extension function
 446          * __mbsnrtowcs which apparently returns 0 if wn == 0 without any
 447          * check on the validity of the mbstate. */
 448         if (!(count = wn)) {
 449                 return 0;
 450         }
 451
 452         if ((mask = (__uwchar_t) ps->__mask) != 0) { /* A continuation... */
 453 #ifdef DECODER
 454                 wc = (__uwchar_t) ps->__wc;
 455                 if (n) {
 456                         goto CONTINUE;
 457                 }
 458                 goto DONE;
 459 #else
 460                 if ((wc = (__uwchar_t) ps->__wc) != 0xffffU) {
 461                         /* TODO: change error code here and below? */
 462                         if (n) {
 463                                 goto CONTINUE;
 464                         }
 465                         goto DONE;
 466                 }
 467                 __set_errno(EILSEQ);
 468                 return (size_t) -1;             /* We're in an error state. */
 469 #endif
 470         }
 471
 472         do {
 473                 if (!n) {
 474                         goto DONE;
 475                 }
 476                 --n;
 477                 if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */
 478                         mask = 0x40;
 479 #ifdef __UCLIBC_MJN3_ONLY__
 480 #warning TODO: Fix range for 16 bit wchar_t case.
 481 #endif
                             /* Only bytes 0xc0..0xfd are accepted as the first
                              * byte of a multibyte sequence. */
 482                         if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) {
 483                                 goto START;
 484                         }
 485                 BAD:
 486 #ifdef DECODER
 487                         wc = 0xfffdU;
 488                         goto COMPLETE;
 489 #else
                             /* Leave the error marker in the state so that later
                              * calls fail until the state is reset. */
 490                         ps->__mask = mask;
 491                         ps->__wc = 0xffffU;
 492                         __set_errno(EILSEQ);
 493                         return (size_t) -1;     /* Illegal start byte! */
 494 #endif
 495
 496                 CONTINUE:
 497                         while (n) {
 498                                 --n;
                                     /* Every following byte must look like 10xxxxxx. */
 499                                 if ((*s & 0xc0) != 0x80) {
 500                                         goto BAD;
 501                                 }
 502                                 mask <<= 5;
 503                                 wc <<= 6;
 504                                 wc += (*s & 0x3f);      /* keep separate for bcc (smaller code) */
 505                                 ++s;
 506                         START:
 507                                 wc &= ~(mask << 1);
 508
 509                                 if ((wc & mask) == 0) { /* Character completed. */
 510                                         if ((mask >>= 5) == 0x40) {
 511                                                 mask += mask;
 512                                         }
 513                                         /* Check for invalid sequences (longer than necessary)
 514                                          * and invalid chars.  */
 515                                         if ( (wc < mask) /* Sequence not minimal length. */
 516 #ifdef KUHN
 517 #if UTF_8_MAX_LEN == 3
 518 #error broken since mask can overflow!!
 519                                                  /* For plane 0, these are the only defined values.*/
 520                                                  || (wc > 0xfffdU)
 521 #else
 522                                                  /* Note that we don't need to worry about exceeding */
 523                                                  /* 31 bits as that is the most that UTF-8 provides. */
 524                                                  || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 525 #endif
 526                                                  || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 527 #endif /* KUHN */
 528                                                  ) {
 529                                                 goto BAD;
 530                                         }
 531                                         goto COMPLETE;
 532                                 }
 533                         }
 534                         /* Character potentially valid but incomplete. */
 535                         if (!allow_continuation) {
 536                                 if (count != wn) {
 537                                         return 0;
 538                                 }
 539                                 /* NOTE: The following can fail if you allow and then disallow
 540                                  * continuation!!! */
 541 #if UTF_8_MAX_LEN == 3
 542 #error broken since mask can overflow!!
 543 #endif
 544                                 /* Need to back up... */
 545                                 do {
 546                                         --s;
 547                                 } while ((mask >>= 5) >= 0x40);
 548                                 goto DONE;
 549                         }
                             /* Save the partial sequence so a later call can resume it. */
 550                         ps->__mask = (wchar_t) mask;
 551                         ps->__wc = (wchar_t) wc;
 552                         *src = s;
 553                         return (size_t) -2;
 554                 }
 555         COMPLETE:
                     /* incr is 0 in the length-only modes, so pwc stays on the scratch slot. */
 556                 *pwc = wc;
 557                 pwc += incr;
 558         }
 559 #ifdef DECODER
 560         while (--count);
 561 #else
             /* A nul ends the loop before count is decremented, so it is not counted. */
 562         while (wc && --count);
 563
 564         if (!wc) {
 565                 s = NULL;
 566         }
 567 #endif
 568
 569  DONE:
 570         /* ps->__wc is irrelevant here. */
 571         ps->__mask = 0;
 572         if (pwc != wcbuf) {
 573                 *src = s;
 574         }
 575
 576         return wn - count;
 577 }
 578
 579 #endif
 580 /**********************************************************************/
 581 #ifdef L__wchar_wcsntoutf8s
 582
     /* Convert wide characters to UTF-8 bytes.
      *
      *   s    destination.  If NULL, n is treated as SIZE_MAX and nothing is
      *        kept.  If equal to (char *) src, nothing is kept either but n
      *        is honoured.  In both cases each character is encoded into a
      *        local scratch buffer and *src is not updated.
      *   n    maximum number of bytes to produce.
      *   src  in/out pointer to the source wide chars; must not point to NULL.
      *        When output is kept it is set to NULL once a nul wide char is
      *        reached, otherwise to the first unconverted wide char.
      *   wn   maximum number of wide chars to read.
      *
      * Conversion stops at a nul wide char (whose byte is written but not
      * counted), when wn or n is exhausted, or when the next character would
      * not fit in the remaining space (no partial character is counted).
      *
      * Returns the number of bytes produced, or (size_t)-1 with errno set to
      * EILSEQ when a wide char fails the range checks below. */
 583 size_t attribute_hidden _wchar_wcsntoutf8s(char *__restrict s, size_t n,
 584                                                   const wchar_t **__restrict src, size_t wn)
 585 {
 586         register char *p;
 587         size_t len, t;
 588         __uwchar_t wc;
 589         const __uwchar_t *swc;
 590         int store;
 591         char buf[MB_LEN_MAX];
 592         char m;
 593
 594         store = 1;
 595         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 596          * printf, we need to be able to compute the number of bytes needed
 597          * for the mbs conversion, not to exceed the precision specified.
 598          * But if dst is NULL, the return value is the length assuming a
 599          * sufficiently sized buffer.  So, we allow passing of (char *) src
 600          * as dst in order to flag that we really want the length, subject
 601          * to the restricted buffer size and no partial conversions.
 602          * See wcsnrtombs() as well. */
 603         if (!s || (s == ((char *) src))) {
 604                 if (!s) {
 605                         n = SIZE_MAX;
 606                 }
 607             s = buf;
 608                 store = 0;
 609         }
 610
             /* t counts the output bytes still available; n - t is the result. */
 611         t = n;
 612         swc = (const __uwchar_t *) *src;
 613
 614         assert(swc != NULL);
 615
 616         while (wn && t) {
 617                 wc = *swc;
 618
                     /* Assume a one-byte character; fixed up below for wc >= 0x80. */
 619                 *s = wc;
 620                 len = 1;
 621
 622                 if (wc >= 0x80) {
 623 #ifdef KUHN
 624                         if (
 625 #if UTF_8_MAX_LEN == 3
 626                                 /* For plane 0, these are the only defined values.*/
 627                                 /* Note that we don't need to worry about exceeding */
 628                                 /* 31 bits as that is the most that UTF-8 provides. */
 629                                 (wc > 0xfffdU)
 630 #else
 631                                 /* UTF_8_MAX_LEN == 6 */
 632                                 (wc > 0x7fffffffUL)
 633                                 || ( ((__uwchar_t)(wc - 0xfffeU)) < 2)
 634 #endif
 635                                 || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) )
 636                                 ) {
 637                                 __set_errno(EILSEQ);
 638                                 return (size_t) -1;
 639                         }
 640 #else  /* KUHN */
 641 #if UTF_8_MAX_LEN != 3
 642                         if (wc > 0x7fffffffUL) { /* Value too large. */
 643                                 __set_errno(EILSEQ);
 644                                 return (size_t) -1;
 645                         }
 646 #endif
 647 #endif /* KUHN */
 648
                             /* Work out the encoded length: p ends up len bytes past s.
                              * wc is consumed by the shifts and reloaded afterwards. */
 649                         wc >>= 1;
 650                         p = s;
 651                         do {
 652                                 ++p;
 653                         } while (wc >>= 5);
 654                         wc = *swc;
 655                         if ((len = p - s) > t) { /* Not enough space. */
 656                                 break;
 657                         }
 658
                             /* Fill the bytes from last to first, six payload bits each,
                              * while m accumulates the length marker bits that are merged
                              * into the first byte at the end. */
 659                         m = 0x80;
 660                         while( p>s ) {
 661                                 m = (m >> 1) | 0x80;
 662                                 *--p = (wc & 0x3f) | 0x80;
 663                                 wc >>= 6;
 664                         }
 665                         *s |= (m << 1);
 666                 } else if (wc == 0) {   /* End of string. */
 667                         swc = NULL;
 668                         break;
 669                 }
 670
 671                 ++swc;
 672                 --wn;
 673                 t -= len;
                     /* In the length-only modes s keeps pointing at the scratch buffer. */
 674                 if (store) {
 675                         s += len;
 676                 }
 677         }
 678
 679         if (store) {
 680                 *src = (const wchar_t *) swc;
 681         }
 682
 683         return n - t;
 684 }
 685
 686
 687 #endif
 688 /**********************************************************************/
 689 #ifdef L___mbsnrtowcs
 690
 691 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 692
     /* Convert at most NMC bytes of a multibyte string into at most len wide
      * characters.
      *
      *   dst  destination.  If NULL, len is treated as SIZE_MAX and nothing is
      *        kept.  If equal to (wchar_t *) ps, nothing is kept either but
      *        len is honoured.  In both cases *src is not updated.
      *   src  in/out pointer to the source bytes.  When output is kept it is
      *        set to NULL once a nul byte is converted, otherwise to the first
      *        unconverted byte.
      *   NMC  maximum number of source bytes to read.
      *   len  maximum number of wide chars to produce.
      *   ps   conversion state; when NULL a state private to this function
      *        is used.
      *
      * In a UTF-8 locale the call is delegated to _wchar_utf8sntowcs() with
      * continuation allowed, and its (size_t)-2 result is reported as 0.
      * In an 8-bit locale bytes >= 0x80 are mapped through the locale's
      * idx8c2wc/tbl8c2wc tables, and a zero table entry is an error.
      * Otherwise (7-bit) any byte >= 0x80 is an error.
      *
      * Returns the number of wide chars produced (a terminating nul is stored
      * but not counted), or (size_t)-1 with errno set to EILSEQ on error.
      * The definition is also passed to weak_alias() under the name
      * mbsnrtowcs. */
 693 size_t attribute_hidden __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src,
 694                                         size_t NMC, size_t len, mbstate_t *__restrict ps)
 695 {
 696         static mbstate_t mbstate;       /* Rely on bss 0-init. */
 697         wchar_t wcbuf[1];
 698         const char *s;
 699         size_t count;
 700         int incr;
 701
 702         if (!ps) {
 703                 ps = &mbstate;
 704         }
 705
 706 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 707         if (ENCODING == __ctype_encoding_utf8) {
 708                 size_t r;
 709                 return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1))
 710                                 != (size_t) -2) ? r : 0;
 711         }
 712 #endif
 713         incr = 1;
 714         /* NOTE: The following is an AWFUL HACK!  In order to support %s in
 715          * wprintf, we need to be able to compute the number of wchars needed
 716          * for the mbs conversion, not to exceed the precision specified.
 717          * But if dst is NULL, the return value is the length assuming a
 718          * sufficiently sized buffer.  So, we allow passing of ((wchar_t *)ps)
 719          * as dst in order to flag that we really want the length, subject
 720          * to the restricted buffer size and no partial conversions.
 721          * See _wchar_utf8sntowcs() as well. */
 722         if (!dst || (dst == ((wchar_t *)ps))) {
 723                 if (!dst) {
 724                         len = SIZE_MAX;
 725                 }
 726                 dst = wcbuf;
 727                 incr = 0;
 728         }
 729
 730         /* Since all the following encodings are single-byte encodings... */
 731         if (len > NMC) {
 732                 len = NMC;
 733         }
 734
             /* count is the number of wide chars still allowed; len - count is the result. */
 735         count = len;
 736         s = *src;
 737
 738 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 739         if (ENCODING == __ctype_encoding_8_bit) {
 740                 wchar_t wc;
 741                 while (count) {
 742                         if ((wc = ((unsigned char)(*s))) >= 0x80) {     /* Non-ASCII... */
                                     /* Two-level lookup indexed by the byte value minus 0x80. */
 743                                 wc -= 0x80;
 744                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
 745                                                   (__UCLIBC_CURLOCALE_DATA.idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
 746                                                    << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
 747                                 if (!wc) {
 748                                         goto BAD;
 749                                 }
 750                         }
                             /* The nul is stored, then ends the conversion uncounted. */
 751                         if (!(*dst = wc)) {
 752                                 s = NULL;
 753                                 break;
 754                         }
 755                         dst += incr;
 756                         ++s;
 757                         --count;
 758                 }
 759                 if (dst != wcbuf) {
 760                         *src = s;
 761                 }
 762                 return len - count;
 763         }
 764 #endif
 765
 766 #ifdef __UCLIBC_HAS_LOCALE__
 767         assert(ENCODING == __ctype_encoding_7_bit);
 768 #endif
 769
 770         while (count) {
 771                 if ((*dst = (unsigned char) *s) == 0) {
 772                         s = NULL;
 773                         break;
 774                 }
 775                 if (*dst >= 0x80) {
 776 #ifdef __CTYPE_HAS_8_BIT_LOCALES
                     /* Also the target of the goto in the 8-bit loop above. */
 777                 BAD:
 778 #endif
 779                         __set_errno(EILSEQ);
 780                         return (size_t) -1;
 781                 }
 782                 ++s;
 783                 dst += incr;
 784                 --count;
 785         }
 786         if (dst != wcbuf) {
 787                 *src = s;
 788         }
 789         return len - count;
 790 }
 791 weak_alias(__mbsnrtowcs,mbsnrtowcs)
 792
 793 #endif
 794 /**********************************************************************/
 795 #ifdef L___wcsnrtombs
 796
 797 /* WARNING: We treat len as SIZE_MAX when dst is NULL! */
 798
 799 /* Note: We completely ignore ps in all currently supported conversions.
 800  * TODO: Check for valid state anyway? */
 801
 802 size_t attribute_hidden __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src,
 803                                         size_t NWC, size_t len, mbstate_t *__restrict ps)
 804 {
 805         const __uwchar_t *s;
 806         size_t count;
 807         int incr;
 808         char buf[MB_LEN_MAX];
 809
 810 #ifdef __CTYPE_HAS_UTF_8_LOCALES
 811         if (ENCODING == __ctype_encoding_utf8) {
 812                 return _wchar_wcsntoutf8s(dst, len, src, NWC);
 813         }
 814 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
 815
 816         incr = 1;
 817         /* NOTE: The following is an AWFUL HACK!  In order to support %ls in
 818          * printf, we need to be able to compute the number of bytes needed
 819          * for the mbs conversion, not to exceed the precision specified.
 820          * But if dst is NULL, the return value is the length assuming a
 821          * sufficiently sized buffer.  So, we allow passing of (char *) src
 822          * as dst in order to flag that we really want the length, subject
 823          * to the restricted buffer size and no partial conversions.
 824          * See _wchar_wcsntoutf8s() as well. */
 825         if (!dst || (dst == ((char *) src))) {
 826                 if (!dst) {
 827                         len = SIZE_MAX;
 828                 }
 829                 dst = buf;
 830                 incr = 0;
 831         }
 832
 833         /* Since all the following encodings are single-byte encodings... */
 834         if (len > NWC) {
 835                 len = NWC;
 836         }
 837
 838         count = len;
 839         s = (const __uwchar_t *) *src;
 840
 841 #ifdef __CTYPE_HAS_8_BIT_LOCALES
 842         if (ENCODING == __ctype_encoding_8_bit) {
 843                 __uwchar_t wc;
 844                 __uwchar_t u;
 845                 while (count) {
 846                         if ((wc = *s) <= 0x7f) {
 847                                 if (!(*dst = (unsigned char) wc)) {
 848                                         s = NULL;
 849                                         break;
 850                                 }
 851                         } else {
 852                                 u = 0;
 853                                 if (wc <= Cwc2c_DOMAIN_MAX) {
 854                                         u = __UCLIBC_CURLOCALE_DATA.idx8wc2c[wc >> (Cwc2c_TI_SHIFT
 855                                                                                                                 + Cwc2c_TT_SHIFT)];
 856                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
 857                                                                         + ((wc >> Cwc2c_TT_SHIFT)
 858                                                                            & ((1 << Cwc2c_TI_SHIFT)-1))];
 859                                         u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
 860                                                                         + (u << Cwc2c_TT_SHIFT)
 861                                                                         + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
 862                                 }
 863
 864 #define __WCHAR_REPLACEMENT_CHAR '?'
 865 #ifdef __WCHAR_REPLACEMENT_CHAR
 866                                 *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR );
 867 #else  /* __WCHAR_REPLACEMENT_CHAR */
 868                                 if (!u) {
 869                                         goto BAD;
 870                                 }
 871                                 *dst = (unsigned char) u;
 872 #endif /* __WCHAR_REPLACEMENT_CHAR */
 873                         }
 874                         ++s;
 875                         dst += incr;
 876                         --count;
 877                 }
 878                 if (dst != buf) {
 879                         *src = (const wchar_t *) s;
 880                 }
 881                 return len - count;
 882         }
 883 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
 884
 885 #ifdef __UCLIBC_HAS_LOCALE__
 886         assert(ENCODING == __ctype_encoding_7_bit);
 887 #endif
 888
 889         while (count) {
 890                 if (*s >= 0x80) {
 891 #if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR)
 892                 BAD:
 893 #endif
 894                         __set_errno(EILSEQ);
 895                         return (size_t) -1;
 896                 }
 897                 if ((*dst = (unsigned char) *s) == 0) {
 898                         s = NULL;
 899                         break;
 900                 }
 901                 ++s;
 902                 dst += incr;
 903                 --count;
 904         }
 905         if (dst != buf) {
 906                 *src = (const wchar_t *) s;
 907         }
 908         return len - count;
 909 }
 910 weak_alias(__wcsnrtombs,wcsnrtombs)
 911
 912 #endif
 913 /**********************************************************************/
 914 #ifdef L_wcswidth
 915
 916 #ifdef __UCLIBC_MJN3_ONLY__
 917 #warning REMINDER: If we start doing translit, wcwidth and wcswidth will need updating.
 918 #warning TODO: Update wcwidth to match latest by Kuhn.
 919 #endif
 920
 921 #if defined(__UCLIBC_HAS_LOCALE__) && \
 922 ( defined(__CTYPE_HAS_8_BIT_LOCALES) || defined(__CTYPE_HAS_UTF_8_LOCALES) )
 923
/* First-level index for the width lookup in __wcswidth().  For a wide
 * character in 0x100..0xffff, entry [wc >> 8] and the entry after it are
 * used as the initial low and high bounds of the binary search performed
 * in new_tbl (and, by the same index, new_wtbl).  One entry per high
 * byte plus a final end marker. */
static const unsigned char new_idx[] = {
        0,    5,    5,    6,   10,   15,   28,   39,
        48,   48,   71,   94,  113,  128,  139,  154,
        175,  186,  188,  188,  188,  188,  188,  188,
        203,  208,  208,  208,  208,  208,  208,  208,
        208,  219,  219,  219,  222,  222,  222,  222,
        222,  222,  222,  222,  222,  222,  222,  224,
        224,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  231,  231,  231,
        231,  231,  231,  231,  231,  233,  233,  233,
        233,  233,  233,  233,  234,  234,  234,  234,
        234,  234,  234,  234,  234,  234,  234,  234,
        234,  234,  234,  234,  234,  234,  234,  234,
        234,  234,  234,  234,  234,  234,  234,  234,
        234,  234,  234,  234,  234,  234,  234,  234,
        234,  234,  234,  234,  234,  234,  234,  234,
        236,  236,  236,  236,  236,  236,  236,  236,
        236,  236,  236,  236,  236,  236,  236,  236,
        236,  236,  236,  236,  236,  236,  236,  236,
        236,  236,  236,  236,  236,  236,  236,  236,
        236,  237,  237,  238,  241,  241,  242,  249,
        255,
};
 959
/* Second-level table for __wcswidth(): low-byte values that are compared
 * against (wc & 0xff) during the binary search over the slice selected
 * through new_idx.  The search keeps the index whose entry is <= the low
 * byte, so each entry presumably marks the first low byte of a run of
 * characters sharing one width (the run's width is new_wtbl[same index]). */
static const unsigned char new_tbl[] = {
        0x00, 0x01, 0x20, 0x7f, 0xa0, 0x00, 0x00, 0x50,
        0x60, 0x70, 0x00, 0x83, 0x87, 0x88, 0x8a, 0x00,
        0x91, 0xa2, 0xa3, 0xba, 0xbb, 0xbe, 0xbf, 0xc0,
        0xc1, 0xc3, 0xc4, 0xc5, 0x00, 0x4b, 0x56, 0x70,
        0x71, 0xd6, 0xe5, 0xe7, 0xe9, 0xea, 0xee, 0x00,
        0x0f, 0x10, 0x11, 0x12, 0x30, 0x4b, 0xa6, 0xb1,
        0x00, 0x01, 0x03, 0x3c, 0x3d, 0x41, 0x49, 0x4d,
        0x4e, 0x51, 0x55, 0x62, 0x64, 0x81, 0x82, 0xbc,
        0xbd, 0xc1, 0xc5, 0xcd, 0xce, 0xe2, 0xe4, 0x00,
        0x02, 0x03, 0x3c, 0x3d, 0x41, 0x43, 0x47, 0x49,
        0x4b, 0x4e, 0x70, 0x72, 0x81, 0x83, 0xbc, 0xbd,
        0xc1, 0xc6, 0xc7, 0xc9, 0xcd, 0xce, 0x00, 0x01,
        0x02, 0x3c, 0x3d, 0x3f, 0x40, 0x41, 0x44, 0x4d,
        0x4e, 0x56, 0x57, 0x82, 0x83, 0xc0, 0xc1, 0xcd,
        0xce, 0x00, 0x3e, 0x41, 0x46, 0x49, 0x4a, 0x4e,
        0x55, 0x57, 0xbf, 0xc0, 0xc6, 0xc7, 0xcc, 0xce,
        0x00, 0x41, 0x44, 0x4d, 0x4e, 0xca, 0xcb, 0xd2,
        0xd5, 0xd6, 0xd7, 0x00, 0x31, 0x32, 0x34, 0x3b,
        0x47, 0x4f, 0xb1, 0xb2, 0xb4, 0xba, 0xbb, 0xbd,
        0xc8, 0xce, 0x00, 0x18, 0x1a, 0x35, 0x36, 0x37,
        0x38, 0x39, 0x3a, 0x71, 0x7f, 0x80, 0x85, 0x86,
        0x88, 0x90, 0x98, 0x99, 0xbd, 0xc6, 0xc7, 0x00,
        0x2d, 0x31, 0x32, 0x33, 0x36, 0x38, 0x39, 0x3a,
        0x58, 0x5a, 0x00, 0x60, 0x00, 0x12, 0x15, 0x32,
        0x35, 0x52, 0x54, 0x72, 0x74, 0xb7, 0xbe, 0xc6,
        0xc7, 0xc9, 0xd4, 0x00, 0x0b, 0x0f, 0xa9, 0xaa,
        0x00, 0x0b, 0x10, 0x2a, 0x2f, 0x60, 0x64, 0x6a,
        0x70, 0xd0, 0xeb, 0x00, 0x29, 0x2b, 0x00, 0x80,
        0x00, 0x2a, 0x30, 0x3f, 0x40, 0x99, 0x9b, 0x00,
        0xd0, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x1e,
        0x1f, 0x00, 0x00, 0x10, 0x20, 0x24, 0x30, 0x70,
        0xff, 0x00, 0x61, 0xe0, 0xe7, 0xf9, 0xfc,
};
 994
/* Width table for __wcswidth(): once the binary search over new_tbl has
 * settled on an index, the entry at that same index here is added to the
 * running column count.  The values present are -1, 0, 1 and 2; the
 * lookup code notes that no entry reached in practice should be -1. */
static const signed char new_wtbl[] = {
        0,   -1,    1,   -1,    1,    1,    0,    1,
        0,    1,    1,    0,    1,    0,    1,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    0,    1,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    0,    1,    0,    1,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    1,    0,    1,    0,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    1,
        0,    1,    0,    1,    0,    1,    0,    1,
        0,    1,    2,    0,    1,    0,    1,    0,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    1,    0,    1,    0,    1,
        1,    0,    1,    0,    1,    0,    1,    0,
        1,    0,    1,    1,    2,    1,    1,    2,
        2,    0,    2,    1,    2,    0,    2,    2,
        1,    1,    2,    1,    1,    2,    1,    0,
        1,    1,    0,    1,    0,    1,    2,    1,
        0,    2,    1,    2,    1,    0,    1,
};
1029
1030 int attribute_hidden __wcswidth(const wchar_t *pwcs, size_t n)
1031 {
1032     int h, l, m, count;
1033     wchar_t wc;
1034     unsigned char b;
1035
1036         if (ENCODING == __ctype_encoding_7_bit) {
1037                 size_t i;
1038
1039                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1040                         if (pwcs[i] != ((unsigned char)(pwcs[i]))) {
1041                                 return -1;
1042                         }
1043                 }
1044         }
1045 #ifdef __CTYPE_HAS_8_BIT_LOCALES
1046         else if (ENCODING == __ctype_encoding_8_bit) {
1047                 mbstate_t mbstate;
1048
1049                 mbstate.__mask = 0;                     /* Initialize the mbstate. */
1050                 if (__wcsnrtombs(NULL, &pwcs, n, SIZE_MAX, &mbstate) == ((size_t) - 1)) {
1051                         return -1;
1052                 }
1053         }
1054 #endif /* __CTYPE_HAS_8_BIT_LOCALES */
1055 #if defined(__CTYPE_HAS_UTF_8_LOCALES) && defined(KUHN)
1056         /* For stricter handling of allowed unicode values... see comments above. */
1057         else if (ENCODING == __ctype_encoding_utf8) {
1058                 size_t i;
1059
1060                 for (i = 0 ; (i < n) && pwcs[i] ; i++) {
1061                         if ( (((__uwchar_t)((pwcs[i]) - 0xfffeU)) < 2)
1062                                  || (((__uwchar_t)((pwcs[i]) - 0xd800U)) < (0xe000U - 0xd800U))
1063                                 ) {
1064                                 return -1;
1065                         }
1066                 }
1067         }
1068 #endif /* __CTYPE_HAS_UTF_8_LOCALES */
1069
1070     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1071                 if (wc <= 0xff) {
1072                         /* If we're here, wc != 0. */
1073                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1074                                 return -1;
1075                         }
1076                         ++count;
1077                         continue;
1078                 }
1079                 if (((unsigned int) wc) <= 0xffff) {
1080                         b = wc & 0xff;
1081                         h = (wc >> 8);
1082                         l = new_idx[h];
1083                         h = new_idx[h+1];
1084                         while ((m = (l+h) >> 1) != l) {
1085                                 if (b >= new_tbl[m]) {
1086                                         l = m;
1087                                 } else {                /* wc < tbl[m] */
1088                                         h = m;
1089                                 }
1090                         }
1091                         count += new_wtbl[l]; /* none should be -1. */
1092                         continue;
1093                 }
1094
1095                 /* Redo this to minimize average number of compares?*/
1096                 if (wc >= 0x1d167) {
1097                         if (wc <= 0x1d1ad) {
1098                                 if ((wc <= 0x1d169
1099                                          || (wc >= 0x1d173
1100                                                  && (wc <= 0x1d182
1101                                                          || (wc >= 0x1d185
1102                                                                  && (wc <= 0x1d18b
1103                                                                          || (wc >= 0x1d1aa))))))
1104                                         ) {
1105                                         continue;
1106                                 }
1107                         } else if (((wc >= 0xe0020) && (wc <= 0xe007f)) || (wc == 0xe0001)) {
1108                                 continue;
1109                         } else if ((wc >= 0x20000) && (wc <= 0x2ffff)) {
1110                                 ++count;                /* need 2.. add one here */
1111                         }
1112 #if (WCHAR_MAX > 0x7fffffffL)
1113                         else if (wc > 0x7fffffffL) {
1114                                 return -1;
1115                         }
1116 #endif /* (WCHAR_MAX > 0x7fffffffL) */
1117                 }
1118
1119                 ++count;
1120     }
1121
1122     return count;
1123 }
1124
1125 #else  /*  __UCLIBC_HAS_LOCALE__ */
1126
1127 int attribute_hidden __wcswidth(const wchar_t *pwcs, size_t n)
1128 {
1129         int count;
1130         wchar_t wc;
1131
1132     for (count = 0 ; n && (wc = *pwcs++) ; n--) {
1133                 if (wc <= 0xff) {
1134                         /* If we're here, wc != 0. */
1135                         if ((wc < 32) || ((wc >= 0x7f) && (wc < 0xa0))) {
1136                                 return -1;
1137                         }
1138                         ++count;
1139                         continue;
1140                 } else {
1141                         return -1;
1142                 }
1143         }
1144
1145         return count;
1146 }
1147
1148 #endif /*  __UCLIBC_HAS_LOCALE__ */
1149
1150 strong_alias(__wcswidth,wcswidth)
1151
1152 #endif
1153 /**********************************************************************/
1154 #ifdef L_wcwidth
1155
1156 extern int __wcswidth (__const wchar_t *__s, size_t __n) attribute_hidden;
1157
/* Column width of one wide character: the result of __wcswidth() on a
 * one-element string holding wc. */
int wcwidth(wchar_t wc)
{
	wchar_t single[1];

	single[0] = wc;
	return __wcswidth(single, 1);
}
1162
1163 #endif
1164 /**********************************************************************/
1165
1166
/* Conversion state behind an iconv_t handle: allocated and filled in by
 * iconv_open(), used by iconv(), released by iconv_close(). */
typedef struct {
        mbstate_t tostate;              /* Output-side state; __mask is zeroed on open and reset. */
        mbstate_t fromstate;            /* Input-side state; handed to the UTF-8 decoder. */
        int tocodeset;                  /* Target codeset id from find_codeset(). */
        int fromcodeset;                /* Source codeset id; bit 0 is toggled on a byte-swapped BOM. */
        int frombom;                    /* Nonzero until the first input unit has been checked for a BOM. */
        int tobom;                      /* Nonzero while a BOM (0xfeff) is still to be output first. */
        int fromcodeset0;               /* Values of fromcodeset, frombom and tobom as set at */
        int frombom0;                   /* open time; copied back when the descriptor is reset */
        int tobom0;                     /* (iconv() called with a NULL input buffer). */
        int skip_invalid_input;         /* To support iconv -c option. */
} _UC_iconv_t;
1179
1180
1181
1182 #ifdef L_iconv
1183
1184 #include <iconv.h>
1185 #include <string.h>
1186 #include <endian.h>
1187 #include <byteswap.h>
1188
1189 #if (__BYTE_ORDER != __BIG_ENDIAN) && (__BYTE_ORDER != __LITTLE_ENDIAN)
1190 #error unsupported endianness for iconv
1191 #endif
1192
1193 #ifndef __CTYPE_HAS_8_BIT_LOCALES
1194 #error currently iconv requires 8 bit locales
1195 #endif
1196 #ifndef __CTYPE_HAS_UTF_8_LOCALES
1197 #error currently iconv requires UTF-8 locales
1198 #endif
1199
1200
/* Codeset ids used by the iconv code (they appear as the second byte of
 * the entries in __iconv_codesets).  Ids at or above IC_MULTIBYTE are the
 * 2- and 4-byte-unit encodings; iconv() takes the unit size from
 * (id & 6), or 4 for IC_WCHAR_T.  The UCS/UTF values differ with the
 * host byte order. */
enum {
        IC_WCHAR_T = 0xe0,
        IC_MULTIBYTE = 0xe0,            /* Threshold: ids >= this are multibyte-unit codesets. */
#if __BYTE_ORDER == __BIG_ENDIAN
        IC_UCS_4 =      0xec,
        IC_UTF_32 = 0xe4,
        IC_UCS_2 =      0xe2,
        IC_UTF_16 = 0xea,
#else
        IC_UCS_4 =      0xed,
        IC_UTF_32 = 0xe5,
        IC_UCS_2 =      0xe3,
        IC_UTF_16 = 0xeb,
#endif
        IC_UTF_8 = 2,
        IC_ASCII = 1
};
1218
1219 /* For the multibyte
1220  * bit 0 means swap endian
1221  * bit 1 means 2 byte
1222  * bit 2 means 4 byte
1223  *
1224  */
1225
/* Names of the codesets iconv knows without consulting the locale data,
 * searched by find_codeset().  Each entry is laid out as
 *
 *     <entry length> <codeset id> <name> <nul>
 *
 * The length byte covers the whole entry, so adding it to the entry
 * pointer reaches the next entry.  The last entry leaves out the explicit
 * nul: the string literal's own terminator closes the name and, being a
 * zero length byte, also ends the walk.
 *
 * Bit 0x10 of an id requests BOM handling; iconv_open() extracts it.
 * Every id must be written as exactly one byte ("\xNN"), otherwise the
 * length prefixes no longer line up. */
const unsigned char __iconv_codesets[] =
	"\x0a\xe0""WCHAR_T\x00"		/* superset of UCS-4 but platform-endian */
#if __BYTE_ORDER == __BIG_ENDIAN
	"\x08\xec""UCS-4\x00"		/* always BE */
	"\x0a\xec""UCS-4BE\x00"
	"\x0a\xed""UCS-4LE\x00"
	/* Was "\fe4" (form feed + "e4"), which made this entry three bytes
	 * longer than its length prefix.  0xf4 is the platform-endian id 0xe4
	 * with the BOM bit set, mirroring the little-endian table. */
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe4""UTF-32BE\x00"
	"\x0b\xe5""UTF-32LE\x00"
	"\x08\xe2""UCS-2\x00"		/* always BE */
	"\x0a\xe2""UCS-2BE\x00"
	"\x0a\xe3""UCS-2LE\x00"
	/* NOTE(review): unlike the little-endian table (0xfa), this id does
	 * not carry the 0x10 BOM bit - confirm whether that is intended. */
	"\x09\xea""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xea""UTF-16BE\x00"
	"\x0b\xeb""UTF-16LE\x00"
#elif __BYTE_ORDER == __LITTLE_ENDIAN
	"\x08\xed""UCS-4\x00"		/* always BE */
	"\x0a\xed""UCS-4BE\x00"
	"\x0a\xec""UCS-4LE\x00"
	"\x09\xf4""UTF-32\x00"		/* platform endian with BOM */
	"\x0b\xe5""UTF-32BE\x00"
	"\x0b\xe4""UTF-32LE\x00"
	"\x08\xe3""UCS-2\x00"		/* always BE */
	"\x0a\xe3""UCS-2BE\x00"
	"\x0a\xe2""UCS-2LE\x00"
	"\x09\xfa""UTF-16\x00"		/* platform endian with BOM */
	"\x0b\xeb""UTF-16BE\x00"
	"\x0b\xea""UTF-16LE\x00"
#endif
	"\x08\x02""UTF-8\x00"
	"\x0b\x01""US-ASCII\x00"
	"\x07\x01""ASCII";			/* Must be last! (special case to save a nul) */
1258
1259 static int find_codeset(const char *name)
1260 {
1261         const unsigned char *s;
1262         int codeset;
1263
1264         for (s = __iconv_codesets ; *s ; s += *s) {
1265                 if (!__strcasecmp(s+2, name)) {
1266                         return s[1];
1267                 }
1268         }
1269
1270         /* The following is ripped from find_locale in locale.c. */
1271
1272         /* TODO: maybe CODESET_LIST + *s ??? */
1273         /* 7bit is 1, UTF-8 is 2, 8-bit is >= 3 */
1274         codeset = 2;
1275         s = __LOCALE_DATA_CODESET_LIST;
1276         do {
1277                 ++codeset;              /* Increment codeset first. */
1278                 if (!__strcasecmp(__LOCALE_DATA_CODESET_LIST+*s, name)) {
1279                         return codeset;
1280                 }
1281         } while (*++s);
1282
1283         return 0;                       /* No matching codeset! */
1284 }
1285
1286 iconv_t weak_function iconv_open(const char *tocode, const char *fromcode)
1287 {
1288         register _UC_iconv_t *px;
1289         int tocodeset, fromcodeset;
1290
1291         if (((tocodeset = find_codeset(tocode)) != 0)
1292                 && ((fromcodeset = find_codeset(fromcode)) != 0)) {
1293                 if ((px = malloc(sizeof(_UC_iconv_t))) != NULL) {
1294                         px->tocodeset = tocodeset;
1295                         px->tobom0 = px->tobom = (tocodeset & 0x10) >> 4;
1296                         px->fromcodeset0 = px->fromcodeset = fromcodeset;
1297                         px->frombom0 = px->frombom = (fromcodeset & 0x10) >> 4;
1298                         px->skip_invalid_input = px->tostate.__mask
1299                                 = px->fromstate.__mask = 0;
1300                         return (iconv_t) px;
1301                 }
1302         } else {
1303                 __set_errno(EINVAL);
1304         }
1305         return (iconv_t)(-1);
1306 }
1307
1308 int weak_function iconv_close(iconv_t cd)
1309 {
1310         free(cd);
1311
1312         return 0;
1313 }
1314
1315 size_t weak_function iconv(iconv_t cd, char **__restrict inbuf,
1316                                                    size_t *__restrict inbytesleft,
1317                                                    char **__restrict outbuf,
1318                                                    size_t *__restrict outbytesleft)
1319 {
1320         _UC_iconv_t *px = (_UC_iconv_t *) cd;
1321         size_t nrcount, r;
1322         wchar_t wc, wc2;
1323         int inci, inco;
1324
1325         assert(px != (_UC_iconv_t *)(-1));
1326         assert(sizeof(wchar_t) == 4);
1327
1328         if (!inbuf || !*inbuf) {        /* Need to reinitialze conversion state. */
1329                 /* Note: For shift-state encodings we possibly need to output the
1330                  * shift sequence to return to initial state! */
1331                 if ((px->fromcodeset & 0xf0) == 0xe0) {
1332                 }
1333                 px->tostate.__mask = px->fromstate.__mask = 0;
1334                 px->fromcodeset = px->fromcodeset0;
1335                 px->tobom = px->tobom0;
1336                 px->frombom = px->frombom0;
1337                 return 0;
1338         }
1339
1340         nrcount = 0;
1341         while (*inbytesleft) {
1342                 if (!*outbytesleft) {
1343                 TOO_BIG:
1344                         __set_errno(E2BIG);
1345                         return (size_t) -1;
1346                 }
1347
1348                 inci = inco = 1;
1349                 if (px->fromcodeset >= IC_MULTIBYTE) {
1350                         inci = (px->fromcodeset == IC_WCHAR_T) ? 4: (px->fromcodeset & 6);
1351                         if (*inbytesleft < inci) goto INVALID;
1352                         wc = (((unsigned int)((unsigned char)((*inbuf)[0]))) << 8)
1353                                 + ((unsigned char)((*inbuf)[1]));
1354                         if (inci == 4) {
1355                                 wc = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1356                                         + ((unsigned char)((*inbuf)[3])) + (wc << 16);
1357                                 if (!(px->fromcodeset & 1)) wc = bswap_32(wc);
1358                         } else {
1359                                 if (!(px->fromcodeset & 1)) wc = bswap_16(wc);
1360                                 if (((px->fromcodeset & IC_UTF_16) == IC_UTF_16)
1361                                          && (((__uwchar_t)(wc - 0xd800U)) < (0xdc00U - 0xd800U))
1362                                         ) {                     /* surrogate */
1363                                         wc =- 0xd800U;
1364                                         if (*inbytesleft < 4) goto INVALID;
1365                                         wc2 = (((unsigned int)((unsigned char)((*inbuf)[2]))) << 8)
1366                                                 + ((unsigned char)((*inbuf)[3]));
1367                                         if (!(px->fromcodeset & 1)) wc = bswap_16(wc2);
1368                                         if (((__uwchar_t)(wc2 -= 0xdc00U)) < (0xe0000U - 0xdc00U)) {
1369                                                 goto ILLEGAL;
1370                                         }
1371                                         inci = 4;       /* Change inci here in case skipping illegals. */
1372                                         wc = 0x10000UL + (wc << 10) + wc2;
1373                                 }
1374                         }
1375
1376                         if (px->frombom) {
1377                                 px->frombom = 0;
1378                                 if ((wc == 0xfeffU)
1379                                         || (wc == ((inci == 4)
1380                                                            ? (((wchar_t) 0xfffe0000UL))
1381                                                            : ((wchar_t)(0xfffeUL))))
1382                                         ) {
1383                                         if (wc != 0xfeffU) {
1384                                                 px->fromcodeset ^= 1; /* toggle endianness */
1385                                                 wc = 0xfeffU;
1386                                         }
1387                                         if (!px->frombom) {
1388                                                 goto BOM_SKIP_OUTPUT;
1389                                         }
1390                                         goto GOT_BOM;
1391                                 }
1392                         }
1393
1394                         if (px->fromcodeset != IC_WCHAR_T) {
1395                                 if (((__uwchar_t) wc) > (((px->fromcodeset & IC_UCS_4) == IC_UCS_4)
1396                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1397 #ifdef KUHN
1398                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1399                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1400 #endif
1401                                         ) {
1402                                         goto ILLEGAL;
1403                                 }
1404                         }
1405                 } else if (px->fromcodeset == IC_UTF_8) {
1406                         const char *p = *inbuf;
1407                         r = _wchar_utf8sntowcs(&wc, 1, &p, *inbytesleft, &px->fromstate, 0);
1408                         if (((ssize_t) r) <= 0) { /* either EILSEQ or incomplete or nul */
1409                                 if (((ssize_t) r) < 0) { /* either EILSEQ or incomplete or nul */
1410                                         assert((r == (size_t)(-1)) || (r == (size_t)(-2)));
1411                                         if (r == (size_t)(-2)) {
1412                                         INVALID:
1413                                                 __set_errno(EINVAL);
1414                                         } else {
1415                                                 px->fromstate.__mask = 0;
1416                                                 inci = 1;
1417                                         ILLEGAL:
1418                                                 if (px->skip_invalid_input) {
1419                                                         px->skip_invalid_input = 2;     /* flag for iconv utility */
1420                                                         goto BOM_SKIP_OUTPUT;
1421                                                 }
1422                                                 __set_errno(EILSEQ);
1423                                         }
1424                                         return (size_t)(-1);
1425                                 }
1426 #ifdef __UCLIBC_MJN3_ONLY__
1427 #warning TODO: optimize this.
1428 #endif
1429                                 if (p != NULL) { /* incomplete char case */
1430                                         goto INVALID;
1431                                 }
1432                                 p = *inbuf + 1; /* nul */
1433                         }
1434                         inci = p - *inbuf;
1435                 } else if ((wc = ((unsigned char)(**inbuf))) >= 0x80) { /* Non-ASCII... */
1436                         if (px->fromcodeset == IC_ASCII) { /* US-ASCII codeset */
1437                                 goto ILLEGAL;
1438                         } else {                        /* some other 8-bit ascii-extension codeset */
1439                                 const __codeset_8_bit_t *c8b
1440                                         = __locale_mmap->codeset_8_bit + px->fromcodeset - 3;
1441                                 wc -= 0x80;
1442                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8c2wc[
1443                                                          (c8b->idx8c2wc[wc >> Cc2wc_IDX_SHIFT]
1444                                                           << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))];
1445                                 if (!wc) {
1446                                         goto ILLEGAL;
1447                                 }
1448                         }
1449                 }
1450
1451
1452                 if (px->tobom) {
1453                         inci = 0;
1454                         wc = 0xfeffU;
1455         GOT_BOM:
1456                         px->tobom = 0;
1457                 }
1458
1459                 if (px->tocodeset >= IC_MULTIBYTE) {
1460                         inco = (px->tocodeset == IC_WCHAR_T) ? 4: (px->tocodeset & 6);
1461                         if (*outbytesleft < inco) goto TOO_BIG;
1462                         if (px->tocodeset != IC_WCHAR_T) {
1463                                 if (((__uwchar_t) wc) > (((px->tocodeset & IC_UCS_4) == IC_UCS_4)
1464                                                                                  ? 0x7fffffffUL : 0x10ffffUL)
1465 #ifdef KUHN
1466                                         || (((__uwchar_t)(wc - 0xfffeU)) < 2)
1467                                         || (((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U))
1468 #endif
1469                                         ) {
1470                                 REPLACE_32:
1471                                         wc = 0xfffd;
1472                                         ++nrcount;
1473                                 }
1474                         }
1475                         if (inco == 4) {
1476                                 if (px->tocodeset & 1) wc = bswap_32(wc);
1477                         } else {
1478                                 if (((__uwchar_t)wc ) > 0xffffU) {
1479                                         if ((px->tocodeset & IC_UTF_16) != IC_UTF_16) {
1480                                                 goto REPLACE_32;
1481                                         }
1482                                         if (*outbytesleft < (inco = 4)) goto TOO_BIG;
1483                                         wc2 = 0xdc00U + (wc & 0x3ff);
1484                                         wc = 0xd800U + ((wc >> 10) & 0x3ff);
1485                                         if (px->tocodeset & 1) {
1486                                                 wc = bswap_16(wc);
1487                                                 wc2 = bswap_16(wc2);
1488                                         }
1489                                         wc += (wc2 << 16);
1490                                 } else if (px->tocodeset & 1) wc = bswap_16(wc);
1491                         }
1492                         (*outbuf)[0] = (char)((unsigned char)(wc));
1493                         (*outbuf)[1] = (char)((unsigned char)(wc >> 8));
1494                         if (inco == 4) {
1495                                 (*outbuf)[2] = (char)((unsigned char)(wc >> 16));
1496                                 (*outbuf)[3] = (char)((unsigned char)(wc >> 24));
1497                         }
1498                 } else if (px->tocodeset == IC_UTF_8) {
1499                         const wchar_t *pw = &wc;
1500                         do {
1501                                 r = _wchar_wcsntoutf8s(*outbuf, *outbytesleft, &pw, 1);
1502                                 if (r != (size_t)(-1)) {
1503 #ifdef __UCLIBC_MJN3_ONLY__
1504 #warning TODO: What happens for a nul?
1505 #endif
1506                                         if (r == 0) {
1507                                                 if (wc != 0) {
1508                                                         goto TOO_BIG;
1509                                                 }
1510                                                 ++r;
1511                                         }
1512                                         break;
1513                                 }
1514                                 wc = 0xfffdU;
1515                                 ++nrcount;
1516                         } while (1);
1517                         inco = r;
1518                 } else if (((__uwchar_t)(wc)) < 0x80) {
1519                 CHAR_GOOD:
1520                                 **outbuf = wc;
1521                 } else {
1522                         if ((px->tocodeset != 0x01) && (wc <= Cwc2c_DOMAIN_MAX)) {
1523                                 const __codeset_8_bit_t *c8b
1524                                         = __locale_mmap->codeset_8_bit + px->tocodeset - 3;
1525                                 __uwchar_t u;
1526                                 u = c8b->idx8wc2c[wc >> (Cwc2c_TI_SHIFT + Cwc2c_TT_SHIFT)];
1527                                 u = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[(u << Cwc2c_TI_SHIFT)
1528                                                  + ((wc >> Cwc2c_TT_SHIFT)
1529                                                         & ((1 << Cwc2c_TI_SHIFT)-1))];
1530                                 wc = __UCLIBC_CURLOCALE_DATA.tbl8wc2c[Cwc2c_TI_LEN
1531                                                  + (u << Cwc2c_TT_SHIFT)
1532                                                  + (wc & ((1 << Cwc2c_TT_SHIFT)-1))];
1533                                 if (wc) {
1534                                         goto CHAR_GOOD;
1535                                 }
1536                         }
1537                         **outbuf = '?';
1538                         ++nrcount;
1539                 }
1540
1541                 *outbuf += inco;
1542                 *outbytesleft -= inco;
1543         BOM_SKIP_OUTPUT:
1544                 *inbuf += inci;
1545                 *inbytesleft -= inci;
1546         }
1547         return nrcount;
1548 }
1549
1550 #endif
1551 /**********************************************************************/
1552 #ifdef L_iconv_main
1553
1554 #include <stdio.h>
1555 #include <stdlib.h>
1556 #include <string.h>
1557 #include <wchar.h>
1558 #include <iconv.h>
1559 #include <stdarg.h>
1560 #include <libgen.h>
1561
/* Table of built-in codeset names, defined outside this section.  The -l
 * handler in main() walks it as a sequence of records: the first byte of a
 * record is the distance to the next record, the name string starts at
 * byte 2, and a zero first byte ends the table. */
1562 extern const unsigned char __iconv_codesets[];
1563
/* Sizes of the input (ibuf) and output (obuf) buffers used by main(). */
1564 #define IBUF BUFSIZ
1565 #define OBUF BUFSIZ
1566
/* Program name as invoked (main() sets it from argv[0]); error_msg()
 * prints it as a prefix and the usage text uses its basename. */
1567 char *progname;
/* Non-zero when -s was given: error_msg() then prints nothing but still
 * terminates the program with EXIT_FAILURE. */
1568 int hide_errors;
1569
1570 static void error_msg(const char *fmt, ...)
1571          __attribute__ ((noreturn, format (printf, 1, 2)));
1572
1573 static void error_msg(const char *fmt, ...)
1574 {
1575         va_list arg;
1576
1577         if (!hide_errors) {
1578                 fprintf(stderr, "%s: ", progname);
1579                 va_start(arg, fmt);
1580                 vfprintf(stderr, fmt, arg);
1581                 va_end(arg);
1582         }
1583
1584         __exit(EXIT_FAILURE);
1585 }
1586
1587 int main(int argc, char **argv)
1588 {
1589         FILE *ifile;
1590         FILE *ofile = stdout;
1591         const char *p;
1592         const char *s;
1593         static const char opt_chars[] = "tfocsl";
1594                                       /* 012345 */
1595         const char *opts[sizeof(opt_chars)]; /* last is infile name */
1596         iconv_t ic;
1597         char ibuf[IBUF];
1598         char obuf[OBUF];
1599         char *pi;
1600         char *po;
1601         size_t ni, no, r, pos;
1602
1603         hide_errors = 0;
1604
1605         for (s = opt_chars ; *s ; s++) {
1606                 opts[ s - opt_chars ] = NULL;
1607         }
1608
1609         progname = *argv;
1610         while (--argc) {
1611                 p = *++argv;
1612                 if ((*p != '-') || (*++p == 0)) {
1613                         break;
1614                 }
1615                 do {
1616                         if ((s = strchr(opt_chars,*p)) == NULL) {
1617                         USAGE:
1618                                 s = basename(progname);
1619                                 fprintf(stderr,
1620                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
1621                                                 "  or\n%s -l\n", s, s);
1622                                 return EXIT_FAILURE;
1623                         }
1624                         if ((s - opt_chars) < 3) {
1625                                 if ((--argc == 0) || opts[s - opt_chars]) {
1626                                         goto USAGE;
1627                                 }
1628                                 opts[s - opt_chars] = *++argv;
1629                         } else {
1630                                 opts[s - opt_chars] = p;
1631                         }
1632                 } while (*++p);
1633         }
1634
1635         if (opts[5]) {                          /* -l */
1636                 fprintf(stderr, "Recognized codesets:\n");
1637                 for (s = __iconv_codesets ; *s ; s += *s) {
1638                         fprintf(stderr,"  %s\n", s+2);
1639                 }
1640                 s = __LOCALE_DATA_CODESET_LIST;
1641                 do {
1642                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
1643                 } while (*++s);
1644
1645                 return EXIT_SUCCESS;
1646         }
1647
1648         if (opts[4]) {
1649                 hide_errors = 1;
1650         }
1651
1652         if (!opts[0] || !opts[1]) {
1653                 goto USAGE;
1654         }
1655         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
1656                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[0], opts[1]);
1657         }
1658         if (opts[3]) {                          /* -c */
1659                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
1660         }
1661
1662         if ((s = opts[2]) != NULL) {
1663                 if (!(ofile = fopen(s, "w"))) {
1664                         error_msg( "couldn't open %s for writing\n", s);
1665                 }
1666         }
1667
1668         pos = ni = 0;
1669         do {
1670                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
1671                         ifile = stdin;          /* we don't check for duplicates */
1672                 } else if (!(ifile = fopen(*argv, "r"))) {
1673                         error_msg( "couldn't open %s for reading\n", *argv);
1674                 }
1675
1676                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
1677                         pos += r;
1678                         ni += r;
1679                         no = OBUF;
1680                         pi = ibuf;
1681                         po = obuf;
1682                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
1683                                 if ((errno != EINVAL) && (errno != E2BIG)) {
1684                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
1685                                 }
1686                         }
1687                         if ((r = OBUF - no) > 0) {
1688                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
1689                                         error_msg( "write error\n");
1690                                 }
1691                         }
1692                         if (ni) {                       /* still bytes in buffer! */
1693                                 memmove(ibuf, pi, ni);
1694                         }
1695                 }
1696
1697                 if (ferror(ifile)) {
1698                         error_msg( "read error\n");
1699                 }
1700
1701                 ++argv;
1702
1703                 if (ifile != stdin) {
1704                         fclose(ifile);
1705                 }
1706
1707         } while (--argc > 0);
1708
1709         iconv_close(ic);
1710
1711         if (ni) {
1712                 error_msg( "incomplete sequence\n");
1713         }
1714
1715         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
1716                 ? EXIT_SUCCESS : EXIT_FAILURE;
1717 }
1718
1719 #endif
1720 /**********************************************************************/